From be8fd315a7baf7d63ef0c39e8116740b9dcea872 Mon Sep 17 00:00:00 2001 From: Abhishek Divekar Date: Thu, 23 Jan 2025 12:00:28 +0530 Subject: [PATCH] [fmcore] Migrations: (i) modules to bears (ii) Pydantic 1.10.15 to >=2.10.5 (iii) external autoenum package. - Moved utils to bears PyPI package - Moved data processing to bears PyPI package - Moved (most) constants to bears PyPI package - Refactored to use external autoenum PyPI package - Pydantic v2 migration: Replaced validator with model_validator - Pydantic v2 migration: Replaced root_validator with model_validator - Pydantic v2 migration: Replaced Config class with model_config - Removed "from typing import *" usage. - Various minor fixes. - Moved code to "private" modules. - Removed requirements.txt as it is no longer used for dependency management. --- .gitignore | 3 +- pyproject.toml | 31 +- requirements.txt | 286 - src/fmcore/__init__.py | 3 +- src/fmcore/algorithm/__init__.py | 2 +- src/fmcore/algorithm/bedrock.py | 43 +- .../algorithm/huggingface/transformers.py | 89 +- src/fmcore/algorithm/langchain.py | 16 +- src/fmcore/algorithm/sentence_transformers.py | 17 +- .../algorithm/sklearn/SklearnSGDClassifier.py | 9 +- .../algorithm/sklearn/SklearnSGDRegressor.py | 8 +- .../constants/DataProcessingConstants.py | 168 - src/fmcore/constants/FileConstants.py | 191 - src/fmcore/constants/MLConstants.py | 127 - ...thmConstants.py => _AlgorithmConstants.py} | 2 +- ...MetricConstants.py => _MetricConstants.py} | 4 +- src/fmcore/constants/_TaskConstants.py | 41 + ...onstants.py => _VisualizationConstants.py} | 9 +- src/fmcore/constants/__init__.py | 12 +- src/fmcore/data/FileMetadata.py | 352 - src/fmcore/data/__init__.py | 9 +- src/fmcore/data/asset.py | 145 - src/fmcore/data/pipeline.py | 1130 -- src/fmcore/data/processor/DataProcessor.py | 101 - .../data/processor/Nto1ColumnProcessor.py | 52 - .../data/processor/SingleColumnProcessor.py | 60 - src/fmcore/data/processor/__init__.py | 8 - .../CategoricalMissingValueImputation.py | 68 - .../data/processor/categorical/LabelAffix.py | 35 - .../processor/categorical/LabelEncoding.py | 206 - .../data/processor/categorical/__init__.py | 3 - src/fmcore/data/processor/mixins.py | 137 - .../numeric/NumericMissingValueImputation.py | 75 - src/fmcore/data/processor/numeric/__init__.py | 1 - .../data/processor/text/CaseTransformation.py | 30 - .../data/processor/text/HtmlTagRemoval.py | 19 - .../data/processor/text/PunctuationCleaner.py | 23 - .../data/processor/text/RegexSubstitution.py | 53 - .../data/processor/text/StringRemoval.py | 31 - .../data/processor/text/TFIDFVectorization.py | 66 - .../data/processor/text/TextConcatenation.py | 90 - src/fmcore/data/processor/text/__init__.py | 7 - .../data/processor/vector/VectorAssembler.py | 63 - .../data/processor/vector/VectorDensifier.py | 33 - src/fmcore/data/processor/vector/__init__.py | 2 - src/fmcore/data/reader.py | 1 + src/fmcore/data/writer.py | 81 + src/fmcore/framework/__init__.py | 22 +- .../framework/{algorithm.py => _algorithm.py} | 55 +- .../framework/{chain => _chain}/Chain.py | 40 +- src/fmcore/framework/_chain/__init__.py | 1 + .../framework/{task_data.py => _dataset.py} | 32 +- .../AccelerateEvaluator.py | 24 +- .../{evaluator => _evaluator}/Evaluator.py | 46 +- .../LocalEvaluator.py | 25 +- .../{evaluator => _evaluator}/RayEvaluator.py | 70 +- src/fmcore/framework/_evaluator/__init__.py | 4 + .../framework/{metric.py => _metric.py} | 48 +- .../{predictions.py => _predictions.py} | 48 +- src/fmcore/framework/_task/__init__.py | 7 +
.../{task => _task}/classification.py | 23 +- .../{task => _task}/dense_retrieval.py | 65 +- .../framework/{task => _task}/embedding.py | 14 +- .../framework/{task => _task}/ranking.py | 19 +- .../framework/{task => _task}/regression.py | 14 +- .../framework/{task => _task}/retrieval.py | 48 +- .../{task => _task}/sparse_retrieval.py | 71 +- .../{task => _task}/text_generation.py | 63 +- .../framework/{mixins.py => _task_mixins.py} | 112 +- .../{tracker => _tracker}/AimTracker.py | 13 +- .../{tracker => _tracker}/LogFileTracker.py | 16 +- .../{tracker => _tracker}/Tracker.py | 32 +- .../{tracker => _tracker}/__init__.py | 12 +- .../AccelerateTrainer.py | 25 +- .../{trainer => _trainer}/LocalTrainer.py | 21 +- .../{trainer => _trainer}/RayTuneTrainer.py | 55 +- .../{trainer => _trainer}/Trainer.py | 37 +- src/fmcore/framework/_trainer/__init__.py | 4 + .../framework/{visualize.py => _visualize.py} | 36 +- src/fmcore/framework/chain/__init__.py | 1 - src/fmcore/framework/dl/__init__.py | 1 + src/fmcore/framework/dl/torch/__init__.py | 6 +- .../torch/{torch_base.py => _torch_base.py} | 37 +- ...chTaskDataDataset.py => _torch_dataset.py} | 15 +- .../torch/{torch_tasks.py => _torch_tasks.py} | 40 +- src/fmcore/framework/evaluator/__init__.py | 4 - src/fmcore/framework/task/__init__.py | 7 - src/fmcore/framework/trainer/__init__.py | 4 - src/fmcore/metric/__init__.py | 2 +- src/fmcore/metric/classification_metrics.py | 16 +- src/fmcore/metric/regression_metrics.py | 1 - src/fmcore/metric/text_generation_metrics.py | 137 +- src/fmcore/util/__init__.py | 10 - src/fmcore/util/aws/__init__.py | 1 - src/fmcore/util/aws/s3.py | 559 - src/fmcore/util/concurrency/__init__.py | 8 - src/fmcore/util/concurrency/_asyncio.py | 69 - src/fmcore/util/concurrency/_daemon.py | 120 - src/fmcore/util/concurrency/_dispatch.py | 400 - src/fmcore/util/concurrency/_processes.py | 405 - src/fmcore/util/concurrency/_ray.py | 367 - src/fmcore/util/concurrency/_threads.py | 298 - src/fmcore/util/concurrency/_utils.py | 466 - src/fmcore/util/environment.py | 50 - src/fmcore/util/filesystem.py | 454 - src/fmcore/util/jupyter.py | 220 - src/fmcore/util/language/__init__.py | 13 - src/fmcore/util/language/_alias.py | 374 - src/fmcore/util/language/_autoenum.py | 309 - src/fmcore/util/language/_function.py | 286 - src/fmcore/util/language/_import.py | 131 - src/fmcore/util/language/_iter.py | 101 - src/fmcore/util/language/_math.py | 117 - src/fmcore/util/language/_pbar.py | 255 - src/fmcore/util/language/_selection.py | 409 - src/fmcore/util/language/_string.py | 10099 ---------------- src/fmcore/util/language/_structs.py | 904 -- src/fmcore/util/language/_testing.py | 16 - src/fmcore/util/language/_typing.py | 620 - src/fmcore/util/language/_utils.py | 122 - src/fmcore/util/logging.py | 241 - src/fmcore/util/notify.py | 170 - src/fmcore/util/profiling.py | 252 - src/fmcore/util/schema.py | 753 -- src/fmcore/util/struct.py | 138 - tests/test_imports.py | 6 +- 126 files changed, 1099 insertions(+), 22959 deletions(-) delete mode 100644 requirements.txt delete mode 100644 src/fmcore/constants/DataProcessingConstants.py delete mode 100644 src/fmcore/constants/FileConstants.py delete mode 100755 src/fmcore/constants/MLConstants.py rename src/fmcore/constants/{AlgorithmConstants.py => _AlgorithmConstants.py} (94%) rename src/fmcore/constants/{MetricConstants.py => _MetricConstants.py} (80%) create mode 100755 src/fmcore/constants/_TaskConstants.py rename src/fmcore/constants/{VisualizationConstants.py => _VisualizationConstants.py} 
(89%) delete mode 100644 src/fmcore/data/FileMetadata.py delete mode 100644 src/fmcore/data/asset.py delete mode 100644 src/fmcore/data/pipeline.py delete mode 100644 src/fmcore/data/processor/DataProcessor.py delete mode 100644 src/fmcore/data/processor/Nto1ColumnProcessor.py delete mode 100644 src/fmcore/data/processor/SingleColumnProcessor.py delete mode 100644 src/fmcore/data/processor/__init__.py delete mode 100644 src/fmcore/data/processor/categorical/CategoricalMissingValueImputation.py delete mode 100644 src/fmcore/data/processor/categorical/LabelAffix.py delete mode 100644 src/fmcore/data/processor/categorical/LabelEncoding.py delete mode 100644 src/fmcore/data/processor/categorical/__init__.py delete mode 100644 src/fmcore/data/processor/mixins.py delete mode 100644 src/fmcore/data/processor/numeric/NumericMissingValueImputation.py delete mode 100644 src/fmcore/data/processor/numeric/__init__.py delete mode 100644 src/fmcore/data/processor/text/CaseTransformation.py delete mode 100644 src/fmcore/data/processor/text/HtmlTagRemoval.py delete mode 100644 src/fmcore/data/processor/text/PunctuationCleaner.py delete mode 100644 src/fmcore/data/processor/text/RegexSubstitution.py delete mode 100644 src/fmcore/data/processor/text/StringRemoval.py delete mode 100644 src/fmcore/data/processor/text/TFIDFVectorization.py delete mode 100755 src/fmcore/data/processor/text/TextConcatenation.py delete mode 100644 src/fmcore/data/processor/text/__init__.py delete mode 100644 src/fmcore/data/processor/vector/VectorAssembler.py delete mode 100644 src/fmcore/data/processor/vector/VectorDensifier.py delete mode 100644 src/fmcore/data/processor/vector/__init__.py create mode 100644 src/fmcore/data/reader.py create mode 100644 src/fmcore/data/writer.py rename src/fmcore/framework/{algorithm.py => _algorithm.py} (96%) rename src/fmcore/framework/{chain => _chain}/Chain.py (98%) create mode 100644 src/fmcore/framework/_chain/__init__.py rename src/fmcore/framework/{task_data.py => _dataset.py} (96%) rename src/fmcore/framework/{evaluator => _evaluator}/AccelerateEvaluator.py (97%) rename src/fmcore/framework/{evaluator => _evaluator}/Evaluator.py (96%) rename src/fmcore/framework/{evaluator => _evaluator}/LocalEvaluator.py (91%) rename src/fmcore/framework/{evaluator => _evaluator}/RayEvaluator.py (97%) create mode 100644 src/fmcore/framework/_evaluator/__init__.py rename src/fmcore/framework/{metric.py => _metric.py} (97%) rename src/fmcore/framework/{predictions.py => _predictions.py} (97%) create mode 100644 src/fmcore/framework/_task/__init__.py rename src/fmcore/framework/{task => _task}/classification.py (98%) rename src/fmcore/framework/{task => _task}/dense_retrieval.py (94%) rename src/fmcore/framework/{task => _task}/embedding.py (81%) rename src/fmcore/framework/{task => _task}/ranking.py (68%) rename src/fmcore/framework/{task => _task}/regression.py (85%) rename src/fmcore/framework/{task => _task}/retrieval.py (92%) rename src/fmcore/framework/{task => _task}/sparse_retrieval.py (98%) rename src/fmcore/framework/{task => _task}/text_generation.py (97%) rename src/fmcore/framework/{mixins.py => _task_mixins.py} (97%) rename src/fmcore/framework/{tracker => _tracker}/AimTracker.py (96%) rename src/fmcore/framework/{tracker => _tracker}/LogFileTracker.py (95%) rename src/fmcore/framework/{tracker => _tracker}/Tracker.py (95%) rename src/fmcore/framework/{tracker => _tracker}/__init__.py (57%) rename src/fmcore/framework/{trainer => _trainer}/AccelerateTrainer.py (98%) rename 
src/fmcore/framework/{trainer => _trainer}/LocalTrainer.py (93%) rename src/fmcore/framework/{trainer => _trainer}/RayTuneTrainer.py (99%) rename src/fmcore/framework/{trainer => _trainer}/Trainer.py (98%) create mode 100644 src/fmcore/framework/_trainer/__init__.py rename src/fmcore/framework/{visualize.py => _visualize.py} (97%) delete mode 100644 src/fmcore/framework/chain/__init__.py rename src/fmcore/framework/dl/torch/{torch_base.py => _torch_base.py} (96%) rename src/fmcore/framework/dl/torch/{PyTorchTaskDataDataset.py => _torch_dataset.py} (97%) rename src/fmcore/framework/dl/torch/{torch_tasks.py => _torch_tasks.py} (94%) delete mode 100644 src/fmcore/framework/evaluator/__init__.py delete mode 100644 src/fmcore/framework/task/__init__.py delete mode 100644 src/fmcore/framework/trainer/__init__.py delete mode 100644 src/fmcore/util/__init__.py delete mode 100644 src/fmcore/util/aws/__init__.py delete mode 100644 src/fmcore/util/aws/s3.py delete mode 100644 src/fmcore/util/concurrency/__init__.py delete mode 100644 src/fmcore/util/concurrency/_asyncio.py delete mode 100644 src/fmcore/util/concurrency/_daemon.py delete mode 100644 src/fmcore/util/concurrency/_dispatch.py delete mode 100644 src/fmcore/util/concurrency/_processes.py delete mode 100644 src/fmcore/util/concurrency/_ray.py delete mode 100644 src/fmcore/util/concurrency/_threads.py delete mode 100644 src/fmcore/util/concurrency/_utils.py delete mode 100644 src/fmcore/util/environment.py delete mode 100644 src/fmcore/util/filesystem.py delete mode 100644 src/fmcore/util/jupyter.py delete mode 100644 src/fmcore/util/language/__init__.py delete mode 100644 src/fmcore/util/language/_alias.py delete mode 100644 src/fmcore/util/language/_autoenum.py delete mode 100644 src/fmcore/util/language/_function.py delete mode 100644 src/fmcore/util/language/_import.py delete mode 100644 src/fmcore/util/language/_iter.py delete mode 100644 src/fmcore/util/language/_math.py delete mode 100644 src/fmcore/util/language/_pbar.py delete mode 100644 src/fmcore/util/language/_selection.py delete mode 100644 src/fmcore/util/language/_string.py delete mode 100644 src/fmcore/util/language/_structs.py delete mode 100644 src/fmcore/util/language/_testing.py delete mode 100644 src/fmcore/util/language/_typing.py delete mode 100644 src/fmcore/util/language/_utils.py delete mode 100644 src/fmcore/util/logging.py delete mode 100644 src/fmcore/util/notify.py delete mode 100644 src/fmcore/util/profiling.py delete mode 100644 src/fmcore/util/schema.py delete mode 100644 src/fmcore/util/struct.py diff --git a/.gitignore b/.gitignore index 8ad2a00..ec2de90 100644 --- a/.gitignore +++ b/.gitignore @@ -18,4 +18,5 @@ __pycache__/ /doc/_apidoc/ *.swp -.vscode/settings.json \ No newline at end of file +.vscode/settings.json +*.ipynb \ No newline at end of file diff --git a/pyproject.toml b/pyproject.toml index 02ccb98..4cc3e80 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -10,41 +10,31 @@ authors = [ ] description = "A specialized toolkit for scaling experimental research with Foundation Models." 
readme = "README.md" -requires-python = ">=3.11" +requires-python = ">=3.11.11" classifiers = [ "Programming Language :: Python :: 3", "Operating System :: OS Independent", ] license-files = ["LICENSE"] dependencies = [ + "autoenum", + "bears", "requests", "pyyaml", "urllib3", - "pandas==1.*", - "numpy==1.*", - "pydantic==1.10.15", - "scikit-learn", - "xlrd", - "XlsxWriter", - "openpyxl", - "fastparquet", - "pyarrow", - "s3fs", - "tqdm", - "boto3", - "cloudpickle>=3.0.0", ] [project.optional-dependencies] all = [ + "bears[all]", "pytest", "orjson", - "ray==2.9.2", + "ray", "ray[default]", "ray[tune]", "ray[serve]", "dask[complete]", - "dask==2024.2.0", + "dask", "gpustat", "nvitop", "altair", @@ -56,8 +46,8 @@ all = [ "hvplot>=0.10.0", "matplotlib", "tiktoken", - "torch==2.3.0", - "transformers==4.42.4", + "torch", + "transformers>=4.42.4", "einops", "accelerate", "deepspeed", @@ -110,7 +100,6 @@ ignore = [ "E731", # lambda-assignment: https://docs.astral.sh/ruff/rules/lambda-assignment/ "E741", # ambiguous-variable-name: https://docs.astral.sh/ruff/rules/ambiguous-variable-name/ - ## Ignored because of bad interaction with `from typing import *` - "F405", # undefined-local-with-import-star-usage: https://docs.astral.sh/ruff/rules/undefined-local-with-import-star-usage/ - "F403", # undefined-local-with-import-star: https://docs.astral.sh/ruff/rules/undefined-local-with-import-star/ + ## Ignored because it causes no harm (and is needed sometimes): + "F841", # unused-variable: https://docs.astral.sh/ruff/rules/unused-variable/ ] \ No newline at end of file diff --git a/requirements.txt b/requirements.txt deleted file mode 100644 index a27175b..0000000 --- a/requirements.txt +++ /dev/null @@ -1,286 +0,0 @@ -absl-py==2.1.0 -accelerate @ git+https://github.com/huggingface/accelerate@b7fa2fa956f40e0b6f650d5eb1764680bf3fd8f7 -aim==3.20.1 -aim-ui==3.20.1 -aimrecords==0.0.7 -aimrocks==0.4.0 -aiobotocore==2.13.0 -aiofiles==23.2.1 -aiohttp==3.10.2 -aiohttp-cors==0.7.0 -aioitertools==0.11.0 -aiorwlock==1.4.0 -aiosignal==1.3.1 -alembic==1.13.1 -altair==5.3.0 -anyio==4.4.0 -argon2-cffi==23.1.0 -argon2-cffi-bindings==21.2.0 -arrow==1.3.0 -art==6.2 -asttokens==2.4.1 -async-lru==2.0.4 -attrs==23.2.0 -babel==2.15.0 -base58==2.0.1 -beautifulsoup4==4.12.3 -bitsandbytes==0.43.1 -bleach==6.1.0 -blessed==1.20.0 -blinker==1.8.2 -bokeh==3.4.1 -boto3==1.34.106 -botocore==1.34.106 -brotli==1.1.0 -cachetools==5.3.3 -certifi==2024.7.4 -cffi==1.16.0 -charset-normalizer==3.3.2 -click==8.1.7 -cloudpickle==3.0.0 -cmake==3.29.3 -colorcet==3.1.0 -colorful==0.5.6 -comm==0.2.2 -contourpy==1.2.1 -cramjam==2.8.3 -cryptography==42.0.8 -cycler==0.12.1 -dask==2024.2.0 -datasets==2.2.1 -debugpy==1.8.1 -decorator==5.1.1 -deepspeed==0.14.2 -defusedxml==0.7.1 -dill==0.3.8 -distlib==0.3.8 -distributed==2024.2.0 -docker-pycreds==0.4.0 -einops==0.8.0 -et-xmlfile==1.1.0 -evaluate==0.4.2 -executing==2.0.1 -faiss-cpu==1.8.0 -fastapi==0.109.1 -fastjsonschema==2.19.1 -fastparquet==2024.5.0 -filelock==3.14.0 -fonttools==4.53.0 -fqdn==1.5.1 -frozenlist==1.4.1 -fsspec==2024.6.0 -gitdb==4.0.11 -gitpython==3.1.43 -google-api-core==2.19.0 -google-auth==2.29.0 -googleapis-common-protos==1.63.1 -gpustat==1.1.1 -greenlet==3.0.3 -grpcio==1.64.1 -h11==0.14.0 -hjson==3.1.0 -holoviews==1.18.3 -httpcore==1.0.5 -httptools==0.6.1 -httpx==0.27.0 -huggingface-hub @ git+https://github.com/huggingface/huggingface_hub@919ce7d0ca281574a26cfa73cf242def95ac0119 -hvplot==0.10.0 -idna==3.7 -imageio==2.34.1 -importlib-metadata==7.1.0 -ipykernel==6.29.4 -ipython==8.25.0 
-ipywidgets==8.1.3 -isoduration==20.11.0 -jedi==0.19.1 -jinja2==3.1.4 -jmespath==1.0.1 -joblib==1.4.2 -json5==0.9.25 -jsonpointer==2.4 -jsonschema==4.22.0 -jsonschema-specifications==2023.12.1 -jupyter==1.0.0 -jupyter-client==8.6.2 -jupyter-console==6.6.3 -jupyter-core==5.7.2 -jupyter-events==0.10.0 -jupyter-lsp==2.2.5 -jupyter-server==2.14.1 -jupyter-server-terminals==0.5.3 -jupyterlab==4.2.1 -jupyterlab-pygments==0.3.0 -jupyterlab-server==2.27.2 -jupyterlab-widgets==3.0.11 -kiwisolver==1.4.5 -linkify-it-py==2.0.3 -lit==18.1.6 -locket==1.0.0 -lz4==4.3.3 -mako==1.3.5 -markdown==3.6 -markdown-it-py==3.0.0 -markupsafe==2.1.5 -matplotlib==3.9.0 -matplotlib-inline==0.1.7 -mauve-text==0.3.0 -mdit-py-plugins==0.4.1 -mdurl==0.1.2 -mistune==3.0.2 -mpmath==1.3.0 -msgpack==1.0.8 -multidict==6.0.5 -multiprocess==0.70.16 -nbclient==0.10.0 -nbconvert==7.16.4 -nbformat==5.10.4 -nest-asyncio==1.6.0 -networkx==3.3 -ninja==1.11.1.1 -nltk==3.8.2 -notebook==7.2.0 -notebook-shim==0.2.4 -numpy==1.26.4 -nvidia-cublas-cu11==11.10.3.66 -nvidia-cuda-cupti-cu11==11.7.101 -nvidia-cuda-nvrtc-cu11==11.7.99 -nvidia-cuda-runtime-cu11==11.7.99 -nvidia-cudnn-cu11==8.5.0.96 -nvidia-cufft-cu11==10.9.0.58 -nvidia-curand-cu11==10.2.10.91 -nvidia-cusolver-cu11==11.4.0.1 -nvidia-cusparse-cu11==11.7.4.91 -nvidia-ml-py==12.535.161 -nvidia-nccl-cu11==2.14.3 -nvidia-nvtx-cu11==11.7.91 -nvitop==1.3.2 -opencensus==0.11.4 -opencensus-context==0.1.3 -openpyxl==3.1.3 -orjson==3.10.3 -overrides==7.7.0 -packaging==24.0 -pandas==1.5.3 -pandocfilters==1.5.1 -panel==1.4.4 -param==2.1.0 -parso==0.8.4 -partd==1.4.2 -patsy==0.5.6 -pexpect==4.9.0 -pillow==10.3.0 -pip==24.0 -platformdirs==4.2.2 -plotly==5.22.0 -plotly-express==0.4.1 -prometheus-client==0.20.0 -prompt-toolkit==3.0.46 -proto-plus==1.23.0 -protobuf==4.25.3 -psutil==5.9.8 -ptyprocess==0.7.0 -pure-eval==0.2.2 -py-cpuinfo==9.0.0 -py-spy==0.3.14 -pyarrow==16.1.0 -pyarrow-hotfix==0.6 -pyasn1==0.6.0 -pyasn1-modules==0.4.0 -pycparser==2.22 -pydantic==1.10.15 -pydeck==0.9.1 -pygments==2.18.0 -pynvml==11.5.0 -pyparsing==3.1.2 -python-dateutil==2.9.0.post0 -python-dotenv==1.0.1 -python-json-logger==2.0.7 -pytz==2024.1 -pyviz-comms==3.0.2 -pyyaml==6.0.1 -pyzmq==26.0.3 -qtconsole==5.5.2 -qtpy==2.4.1 -ray==2.9.2 -referencing==0.35.1 -regex==2024.5.15 -requests==2.32.3 -responses==0.18.0 -restrictedpython==7.1 -rfc3339-validator==0.1.4 -rfc3986-validator==0.1.1 -rich==13.7.1 -rpds-py==0.18.1 -rsa==4.9 -s3fs==2024.6.0 -s3transfer==0.10.1 -safetensors==0.4.3 -scikit-learn==1.5.0 -scipy==1.13.1 -seaborn==0.13.2 -send2trash==1.8.3 -sentence-transformers==3.0.0 -sentencepiece==0.2.0 -sentry-sdk==2.8.0 -setproctitle==1.3.3 -setuptools==70.0.0 -six==1.16.0 -smart-open==7.0.4 -smmap==5.0.1 -sniffio==1.3.1 -sortedcontainers==2.4.0 -soupsieve==2.5 -sqlalchemy==2.0.30 -stack-data==0.6.3 -starlette==0.36.2 -statsmodels==0.14.2 -streamlit==1.37.0 -sympy==1.12.1 -tabulate==0.9.0 -tblib==3.0.0 -tenacity==8.3.0 -tensorboard==2.16.2 -tensorboard-data-server==0.7.2 -tensorboardx==2.6.2.2 -termcolor==2.4.0 -terminado==0.18.1 -threadpoolctl==3.5.0 -tiktoken==0.7.0 -tinycss2==1.3.0 -tokenizers==0.19.1 -toml==0.10.2 -toolz==0.12.1 -torch==2.0.1 -tornado==6.4.1 -tqdm==4.66.4 -traitlets==5.14.3 -transformers==4.41.1 -triton==2.0.0 -types-python-dateutil==2.9.0.20240316 -typing-extensions==4.12.1 -uc-micro-py==1.0.3 -uri-template==1.3.0 -urllib3==2.2.2 -uv==0.2.6 -uvicorn==0.30.1 -uvloop==0.19.0 -virtualenv==20.26.2 -wandb==0.17.0 -watchdog==4.0.1 -watchfiles==0.22.0 -wcwidth==0.2.13 -webcolors==1.13 -webencodings==0.5.1 
-websocket-client==1.8.0 -websockets==12.0 -werkzeug==3.0.3 -wheel==0.43.0 -widgetsnbextension==4.0.11 -wrapt==1.16.0 -xlrd==2.0.1 -xlsxwriter==3.2.0 -xxhash==3.4.1 -xyzservices==2024.4.0 -yarl==1.9.4 -zict==3.0.0 -zipp==3.19.2 \ No newline at end of file diff --git a/src/fmcore/__init__.py b/src/fmcore/__init__.py index 1026c70..060e9e1 100644 --- a/src/fmcore/__init__.py +++ b/src/fmcore/__init__.py @@ -1,5 +1,6 @@ ## Import in dependency order: -import fmcore.util +_LIBRARY_NAME: str = 'fmcore' +import bears.util import fmcore.constants import fmcore.data import fmcore.framework diff --git a/src/fmcore/algorithm/__init__.py b/src/fmcore/algorithm/__init__.py index 7b9c69f..eec692d 100644 --- a/src/fmcore/algorithm/__init__.py +++ b/src/fmcore/algorithm/__init__.py @@ -1,7 +1,7 @@ from importlib import import_module import os from pathlib import Path -from fmcore.util.language import String +from bears.util.language import String __THIS_FILE__ = __file__ ## Needed when calling reload() from outside this file. diff --git a/src/fmcore/algorithm/bedrock.py b/src/fmcore/algorithm/bedrock.py index 965afc3..0ac601d 100644 --- a/src/fmcore/algorithm/bedrock.py +++ b/src/fmcore/algorithm/bedrock.py @@ -1,19 +1,10 @@ import json -from typing import * +from typing import Any, ClassVar, Dict, List, Optional, Set, Union -from pydantic import confloat, conint, constr, root_validator - -from fmcore.constants import Parallelize -from fmcore.data import FileMetadata -from fmcore.framework.task.text_generation import ( - GENERATED_TEXTS_COL, - GenerativeLM, - Prompts, - TextGenerationParams, - TextGenerationParamsMapper, -) -from fmcore.util import ( +from bears import FileMetadata +from bears.util import ( Log, + String, accumulate, any_are_none, any_item, @@ -26,6 +17,16 @@ set_param_from_alias, stop_executor, ) +from pydantic import confloat, conint, constr, model_validator + +from fmcore.constants import Parallelize +from fmcore.framework._task.text_generation import ( + GENERATED_TEXTS_COL, + GenerativeLM, + Prompts, + TextGenerationParams, + TextGenerationParamsMapper, +) with optional_dependency("boto3"): import boto3 @@ -141,11 +142,17 @@ def call_bedrock( ) if "anthropic.claude-3" in model_name: generated_text: str = call_claude_v3( - bedrock=bedrock, prompt=prompt, model_name=model_name, **generation_params + bedrock=bedrock, + prompt=prompt, + model_name=model_name, + **generation_params, ) elif "claude" in model_name: generated_text: str = call_claude_v1_v2( - bedrock=bedrock, prompt=prompt, model_name=model_name, **generation_params + bedrock=bedrock, + prompt=prompt, + model_name=model_name, + **generation_params, ) else: bedrock_invoke_model_params = {"prompt": prompt, **generation_params} @@ -189,7 +196,8 @@ class Hyperparameters(GenerativeLM.Hyperparameters): max_workers: int = 1 generation_params: Union[TextGenerationParams, Dict, str] - @root_validator(pre=True) + @model_validator(mode="before") + @classmethod def set_bedrock_params(cls, params: Dict) -> Dict: set_param_from_alias( params, @@ -272,7 +280,8 @@ def prompt_model_with_retries(self, prompt: str) -> str: def predict_step(self, batch: Prompts, **kwargs) -> Any: generated_texts: List = [] - for prompt in batch.prompts().tolist(): ## Template has already been applied + for prompt in batch.prompts().tolist(): + ## Template has already been applied generated_text: Any = dispatch( self.prompt_model_with_retries, prompt, diff --git a/src/fmcore/algorithm/huggingface/transformers.py b/src/fmcore/algorithm/huggingface/transformers.py 
index a2cb121..caa77bd 100644 --- a/src/fmcore/algorithm/huggingface/transformers.py +++ b/src/fmcore/algorithm/huggingface/transformers.py @@ -4,17 +4,25 @@ from collections import OrderedDict from contextlib import contextmanager from math import inf -from typing import * +from typing import ( + Any, + ClassVar, + Dict, + List, + Literal, + Optional, + Set, + Tuple, + Type, + Union, +) import numpy as np -from pydantic import Extra, root_validator -from pydantic.typing import Literal - -from fmcore.constants import MLType -from fmcore.data import FileMetadata -from fmcore.util import ( +from bears import FileMetadata +from bears.util import ( Alias, Parameters, + String, as_list, get_default, ignore_warnings, @@ -22,6 +30,9 @@ safe_validate_arguments, set_param_from_alias, ) +from pydantic import ConfigDict, model_validator + +from fmcore.constants import MLType with optional_dependency("torch", "sentencepiece", "transformers", "tokenizers", "huggingface_hub"): import huggingface_hub @@ -52,7 +63,11 @@ MODEL_MAPPING_NAMES, _BaseAutoModelClass, ) - from transformers.utils.logging import disable_progress_bar, enable_progress_bar, is_progress_bar_enabled + from transformers.utils.logging import ( + disable_progress_bar, + enable_progress_bar, + is_progress_bar_enabled, + ) from fmcore.framework import Dataset from fmcore.framework.dl.torch import ( @@ -63,7 +78,7 @@ PyTorchClassifierMixin, PyTorchMultiLabelClassifierMixin, ) - from fmcore.framework.task.text_generation import ( + from fmcore.framework._task.text_generation import ( GENERATED_TEXTS_COL, GenerationOutputScoresFormat, GenerativeLM, @@ -95,7 +110,8 @@ class HFPyTorchModel(PyTorch, ABC): cache_dir: Optional[Union[FileMetadata, Dict, str]] = None init_empty: bool = False - @root_validator(pre=True) + @model_validator(mode="before") + @classmethod def set_aliases(cls, params: Dict) -> Dict: Alias.set_cache_dir(params) if params.get("cache_dir") is not None: @@ -133,7 +149,7 @@ def create_model( else: # print( # f"Loading tokenizer from: '{model_dir.path}' using params: " - # f"{self.hyperparams.model_config}" + # f"{self.hyperparams.model_config_}" # ) with disable_hf_logging(): return self.AutoModelClass.from_pretrained( @@ -141,7 +157,7 @@ def create_model( device_map=self.hyperparams.device_map, torch_dtype=self.hyperparams.torch_dtype, **{ - **self.hyperparams.model_config, + **self.hyperparams.model_config_, **dict( cache_dir=cache_dir, trust_remote_code=True, @@ -154,7 +170,7 @@ def create_model_config(self) -> PretrainedConfig: return AutoConfig.from_pretrained( self.hyperparams.model_name, **{ - **self.hyperparams.model_config, + **self.hyperparams.model_config_, **dict( cache_dir=self.cache_dir.path if self.cache_dir is not None else None, trust_remote_code=True, @@ -239,14 +255,16 @@ def download_model_to_cache_dir( ) class HFTokenizerConfig(Parameters): - class Config(Parameters.Config): - extra = Extra.allow + model_config = ConfigDict( + extra="allow", + ) tokenizer_name: Optional[str] = None pad_token: Optional[str] = None truncation_side: Literal["left", "right"] = "right" ## Keeps tokens at the start of the string - @root_validator(pre=True) + @model_validator(mode="before") + @classmethod def set_aliases(cls, params: Dict) -> Dict: set_param_from_alias( params, @@ -261,8 +279,9 @@ def set_aliases(cls, params: Dict) -> Dict: return params class HFTokenizerEncode(Parameters): - class Config(Parameters.Config): - extra = Extra.allow + model_config = ConfigDict( + extra="allow", + ) max_length: Optional[int] = None 
padding: Literal["longest", "max_length", "do_not_pad"] = "longest" ## Same as padding="True" @@ -270,7 +289,8 @@ class Config(Parameters.Config): "longest_first" ## Same as truncation="True" ) - @root_validator(pre=True) + @model_validator(mode="before") + @classmethod def set_aliases(cls, params: Dict) -> Dict: set_param_from_alias( params, @@ -285,8 +305,9 @@ def set_aliases(cls, params: Dict) -> Dict: return params class HFTokenizerDecode(Parameters): - class Config(Parameters.Config): - extra = Extra.allow + model_config = ConfigDict( + extra="allow", + ) skip_special_tokens: bool = True clean_up_tokenization_spaces: bool = True @@ -320,7 +341,8 @@ class Hyperparameters(PyTorch.Hyperparameters): tokenizer_config: HFTokenizerConfig = dict() tokenizer_encode: HFTokenizerEncode = dict() tokenizer_special_tokens: Optional[Dict] = None - model_config: Dict[str, Any] = dict() + ## "model_config" would clash with Pydantic v2's model_config, so adding underscore. + model_config_: Dict[str, Any] = dict() device_map: Optional[Union[Dict, str]] = None torch_dtype: Optional[Union[torch.dtype, str]] = None optimizer: Optimizer = dict( @@ -330,8 +352,18 @@ class Hyperparameters(PyTorch.Hyperparameters): eps=1e-8, ) - @root_validator(pre=True) + @model_validator(mode="before") + @classmethod def set_hf_text_model_params(cls, params: Dict) -> Dict: + ## Allow setting via "model_config" or "model_config_": + set_param_from_alias( + params, + param="model_config_", + alias=[ + "model_config", + ], + ) + set_param_from_alias( params, param="model_name", @@ -485,7 +517,8 @@ def embedding_size(self) -> int: @staticmethod def mean_pooling(model_output, attention_mask) -> Tensor: - token_embeddings = model_output[0] ## First element of model_output contains all token embeddings + ## First element of model_output contains all token embeddings: + token_embeddings = model_output[0] token_embeddings = token_embeddings.masked_fill(~attention_mask[..., None].bool(), 0.0) sentence_embeddings = token_embeddings.sum(dim=1) / attention_mask.sum(dim=1)[..., None] return sentence_embeddings @@ -504,7 +537,7 @@ def create_model_config(self) -> PretrainedConfig: return AutoConfig.from_pretrained( self.hyperparams.model_name, **{ - **self.hyperparams.model_config, + **self.hyperparams.model_config_, **dict( num_labels=self.num_labels, cache_dir=self.cache_dir.path if self.cache_dir is not None else None, @@ -624,7 +657,8 @@ class Hyperparameters(HFPyTorchTextModel.Hyperparameters): ) tokenizer_decode: HFTokenizerDecode = dict() - @root_validator(pre=True) + @model_validator(mode="before") + @classmethod def set_generative_lm_params(cls, params: Dict) -> Dict: set_param_from_alias( params, @@ -669,7 +703,8 @@ def forward(self, input: Dict, **kwargs) -> Dict: gen_kwargs: Dict = { **input, **self.hyperparams.generation_params.hf_dict(), - **dict(return_dict_in_generate=True), ## Always return a *DecoderOnlyOutput + ## Always return a *DecoderOnlyOutput: + **dict(return_dict_in_generate=True), } if self.stop_sequences is not None: gen_kwargs["stopping_criteria"] = HFSubstringMatchStoppingCriteria( diff --git a/src/fmcore/algorithm/langchain.py b/src/fmcore/algorithm/langchain.py index a24d553..1c46a2a 100644 --- a/src/fmcore/algorithm/langchain.py +++ b/src/fmcore/algorithm/langchain.py @@ -1,12 +1,11 @@ import os -from typing import * +from typing import Any, ClassVar, Dict, List, Literal, Optional, Type, Union -from pydantic import confloat, conint, root_validator -from pydantic.typing import Literal +from bears import 
FileMetadata +from bears.util import Log, MappedParameters, String, optional_dependency, retry +from pydantic import confloat, conint, model_validator -from fmcore.data import FileMetadata -from fmcore.framework.task.text_generation import GENERATED_TEXTS_COL, GenerativeLM, Prompts -from fmcore.util import Log, MappedParameters, optional_dependency, retry +from fmcore.framework._task.text_generation import GENERATED_TEXTS_COL, GenerativeLM, Prompts with optional_dependency("langchain"): from langchain import Anthropic, HuggingFaceHub, LLMChain, OpenAI, PromptTemplate @@ -14,7 +13,7 @@ from langchain.llms.base import BaseLLM class LangChainLLM(MappedParameters): - _mapping = { + mapping_dict: ClassVar[Dict[str, Type]] = { "OpenAI": OpenAI, "ChatOpenAI": ChatOpenAI, "Anthropic": Anthropic, @@ -33,7 +32,8 @@ class Hyperparameters(GenerativeLM.Hyperparameters): retry_wait: confloat(ge=0) = 30.0 retry_jitter: confloat(ge=0) = 0.25 - @root_validator(pre=True) + @model_validator(mode="before") + @classmethod def set_langchain_params(cls, params: Dict) -> Dict: params["batch_size"] = 1 params["llm"]: LangChainLLM = LangChainLLM.of(params["llm"]) diff --git a/src/fmcore/algorithm/sentence_transformers.py b/src/fmcore/algorithm/sentence_transformers.py index 75571c6..8b86591 100644 --- a/src/fmcore/algorithm/sentence_transformers.py +++ b/src/fmcore/algorithm/sentence_transformers.py @@ -1,13 +1,17 @@ import os -from typing import * +from typing import ( + Dict, + List, + Optional, +) -from pydantic import root_validator +from bears import FileMetadata +from bears.util import optional_dependency, set_param_from_alias +from pydantic import model_validator from fmcore.constants import MLType, Storage -from fmcore.data import FileMetadata from fmcore.framework.dl.torch import PyTorchBaseModel -from fmcore.framework.task import EmbeddingData -from fmcore.util import optional_dependency, set_param_from_alias +from fmcore.framework._task import EmbeddingData with optional_dependency("torch", "transformers"): import torch @@ -30,7 +34,8 @@ class Hyperparameters(PyTorchBaseModel.Hyperparameters): padding: bool = True truncation: bool = True - @root_validator(pre=True) + @model_validator(mode="before") + @classmethod def set_aliases(cls, params: Dict) -> Dict: set_param_from_alias( params, diff --git a/src/fmcore/algorithm/sklearn/SklearnSGDClassifier.py b/src/fmcore/algorithm/sklearn/SklearnSGDClassifier.py index e8b86c9..ffcff63 100644 --- a/src/fmcore/algorithm/sklearn/SklearnSGDClassifier.py +++ b/src/fmcore/algorithm/sklearn/SklearnSGDClassifier.py @@ -1,13 +1,16 @@ import os -from typing import * +from typing import ( + Dict, + Optional, +) import numpy as np import pandas as pd +from bears import FileMetadata +from bears.util import optional_dependency from fmcore.constants import MLType, Storage -from fmcore.data import FileMetadata from fmcore.framework import ClassificationData, Classifier, EncodingRange -from fmcore.util import optional_dependency with optional_dependency("sklearn", "joblib", error="raise"): import joblib diff --git a/src/fmcore/algorithm/sklearn/SklearnSGDRegressor.py b/src/fmcore/algorithm/sklearn/SklearnSGDRegressor.py index 4ecc90e..b05abd9 100644 --- a/src/fmcore/algorithm/sklearn/SklearnSGDRegressor.py +++ b/src/fmcore/algorithm/sklearn/SklearnSGDRegressor.py @@ -1,13 +1,15 @@ import os -from typing import * +from typing import ( + Optional, +) import numpy as np import pandas as pd +from bears import FileMetadata +from bears.util import optional_dependency from 
fmcore.constants import MLType, Storage -from fmcore.data import FileMetadata from fmcore.framework import RegressionData, Regressor -from fmcore.util import optional_dependency with optional_dependency("sklearn", "joblib"): import joblib diff --git a/src/fmcore/constants/DataProcessingConstants.py b/src/fmcore/constants/DataProcessingConstants.py deleted file mode 100644 index c1a4b3d..0000000 --- a/src/fmcore/constants/DataProcessingConstants.py +++ /dev/null @@ -1,168 +0,0 @@ -import csv -from typing import * - -import numpy as np - -from fmcore.constants.MLConstants import MLType -from fmcore.util import AutoEnum, alias, as_list, auto, optional_dependency - -DEFAULT_RANDOM_SEED: int = ( - 42 ## https://en.wikipedia.org/wiki/42_(number)#The_Hitchhiker's_Guide_to_the_Galaxy -) - - -class DataLayout(AutoEnum): - DATUM = auto() - LIST_OF_DICT = auto() ## List dicts with various columns (sparse storage). Fast row-wise access. - DICT = ( - auto() - ) ## Single Dict with Numpy Arrays or Tensorts for columns (dense storage). Fast column-wise access. - RECORD = ( - auto() - ) ## Single Dict with Numpy Arrays or Tensorts for columns (dense storage). Fast column-wise access. - NUMPY = auto() ## Numpy array (dense storage). Useful for row-wise access. - TORCH = auto() - TENSORFLOW = auto() - JAX = auto() - NUMPY_RECORD_ARRAY = auto() ## Numpy array of tuples (dense storage). Fast row-wise access. - PANDAS = auto() ## Numpy array with extra metadata (dense storage). Fast row-wise or column-wise access. - DASK = auto() ## Lazily-evaluated DataFrame (dense storage). Fast column-wise access. - - -SDF_DATA_LAYOUT_PRIORITY: List[DataLayout] = [ - ## Do not include DataLayout.RECORD in this. - DataLayout.DICT, - DataLayout.LIST_OF_DICT, - DataLayout.PANDAS, - DataLayout.DASK, -] -LAZY_SDF_DATA_LAYOUTS: List[DataLayout] = [ - DataLayout.DASK, -] - -SS_DATA_LAYOUT_PRIORITY: List[DataLayout] = [ - DataLayout.NUMPY, - DataLayout.PANDAS, - DataLayout.DASK, -] - -TENSOR_SS_DATA_LAYOUT_PRIORITY: List[DataLayout] = [ - DataLayout.TORCH, - DataLayout.TENSORFLOW, - DataLayout.JAX, -] - -AVAILABLE_TENSOR_TYPES: Dict[DataLayout, Type] = {DataLayout.NUMPY: np.ndarray} -with optional_dependency("torch"): - import torch - - AVAILABLE_TENSOR_TYPES[DataLayout.TORCH] = torch.Tensor - -with optional_dependency("tensorflow"): - import tensorflow as tf - - AVAILABLE_TENSOR_TYPES[DataLayout.TENSORFLOW] = tf.Tensor - -with optional_dependency("jax", "flax"): - import jax.numpy as jnp - - AVAILABLE_TENSOR_TYPES[DataLayout.JAX] = jnp.ndarray - -AVAILABLE_DEEP_LEARNING_PACKAGES: Set[DataLayout] = set(AVAILABLE_TENSOR_TYPES.keys()) - -TENSOR_LAYOUT_TO_SHORTHAND_MAP: Dict[DataLayout, List[str]] = { - DataLayout.NUMPY: ["np", "numpy"], - DataLayout.TORCH: ["pt", "torch", "pytorch"], - DataLayout.TENSORFLOW: ["tf", "tensorflow"], - DataLayout.JAX: ["jax"], -} -TensorShortHand = Literal["np", "numpy", "pt", "torch", "pytorch", "tf", "tensorflow", "jax"] - -SHORTHAND_TO_TENSOR_LAYOUT_MAP: Dict[str, DataLayout] = {} -for tensor_layout, shorthand in TENSOR_LAYOUT_TO_SHORTHAND_MAP.items(): - for sh in as_list(shorthand): - if sh in SHORTHAND_TO_TENSOR_LAYOUT_MAP: - raise ValueError(f"Cannot have duplicate file-ending keys: {sh}") - SHORTHAND_TO_TENSOR_LAYOUT_MAP[sh] = tensor_layout - - -class ProcessingMode(AutoEnum): - TRANSFORM = auto() - FIT_TRANSFORM = auto() - ZIPPING = auto() - TRANSFORM_SINGLE_ROW = auto() - - def get_data_layout(self): - return DataLayout.RECORD if self.name is ProcessingMode.TRANSFORM_SINGLE_ROW else None - - 
-class MissingColumnBehavior(AutoEnum): - ERROR = auto() - SKIP = auto() - EXECUTE = auto() - - -class Parallelize(AutoEnum): - asyncio = alias("async", "asynchronous") - sync = alias("synchronous") - threads = alias("thread") - processes = alias("proc", "process") - ray = auto() - - -QUOTING_MAP: Dict = { - "quote_none": csv.QUOTE_NONE, - csv.QUOTE_NONE: csv.QUOTE_NONE, - "quote_minimal": csv.QUOTE_MINIMAL, - csv.QUOTE_MINIMAL: csv.QUOTE_MINIMAL, - "quote_nonnumeric": csv.QUOTE_NONNUMERIC, - csv.QUOTE_NONNUMERIC: csv.QUOTE_NONNUMERIC, - "quote_all": csv.QUOTE_ALL, - csv.QUOTE_ALL: csv.QUOTE_ALL, -} - -DASK_APPLY_OUTPUT_MLTYPE_TO_META_MAP = { - MLType.BOOL: bool, - MLType.TEXT: str, - MLType.INT: int, - MLType.FLOAT: float, - MLType.VECTOR: list, -} - - -class DataPosition(AutoEnum): - START = auto() - MIDDLE = auto() - END = auto() - - -class AggregationStrategy(AutoEnum): - AVERAGE = auto() - MIN = auto() - MAX = auto() - MEDIAN = auto() - MODE = auto() - NONE = auto() - - -class CompressionEngine(AutoEnum): - BROTLI = auto() - GZIP = auto() - - -class Status(AutoEnum): - PENDING = alias("SCHEDULED") ## The job has not yet started executing - RUNNING = auto() ## The job is currently running. - STOPPED = auto() ## The job was intentionally stopped by the user. - SUCCEEDED = alias("SUCCESS", "SUCCESSFUL") ## The job finished successfully. - FAILED = auto() ## The job failed. - - -COMPLETED_STATUSES: Set[Status] = {Status.STOPPED, Status.SUCCEEDED, Status.FAILED} - - -class FailureAction(AutoEnum): - ERROR = auto() - ERROR_DELAYED = auto() - WARN = auto() - IGNORE = auto() diff --git a/src/fmcore/constants/FileConstants.py b/src/fmcore/constants/FileConstants.py deleted file mode 100644 index 194fd62..0000000 --- a/src/fmcore/constants/FileConstants.py +++ /dev/null @@ -1,191 +0,0 @@ -from typing import * - -from fmcore.util import AutoEnum, as_list, auto - -UNKNOWN_LABEL_FILL: str = "__UNKNOWN__LABEL__" - - -class FileFormat(AutoEnum): - ## Config: - YAML = auto() - JSON = auto() - ## Dataframe: - CSV = auto() - TSV = auto() - PARQUET = auto() - METRICS_JSONLINES = auto() - JSONLINES = auto() - EXCEL = auto() - LIBSVM = auto() ## Example datasets: https://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/ - ## Binary - PICKLE = auto() - BIN = auto() - ## Algorithm: - FASTTEXT = auto() - BLAZINGTEXT = auto() - VOWPALWABBIT = auto() - XGBOOST = auto() - TFRECORD = auto() - ## Image: - PNG = auto() - JPEG = auto() - TIFF = auto() - BMP = auto() - GIF = auto() - ICO = auto() - WEBP = auto() - SVG = auto() - ## Document: - PDF = auto() - # Embedding: - NPZ = auto() - PSD = auto() ## Adobe Photoshop - ## Compressed formats: - ZIP = auto() - ## Other: - PLAIN_TEXT = auto() - CUSTOM = auto() - - def is_binary_format(self): - return self in BINARY_FILE_FORMATS - - -class Storage(AutoEnum): - STREAM = auto() ## io.StringIO and io.BytesIO - LOCAL_FILE_SYSTEM = auto() ## /whatever/the/path - S3 = auto() ## s3:// - URL = auto() ## http://, https://, etc. 
- - -REMOTE_STORAGES: Set[Storage] = {Storage.S3, Storage.URL} - - -class FileContents(AutoEnum): - CONFIG = auto() - SCHEMA = auto() - AIW_SCHEMA = auto() - PICKLED_OBJECT = auto() - DATAFRAME = auto() - ASSET = auto() - LABEL_ENCODING_DATAFRAME = auto() - TRANSFORMATION_PIPELINE_ARTIFACTS_DIR = auto() - ALGORITHM_TRAIN_DATASET = auto() - ALGORITHM_INFERENCE_DATASET = auto() - ALGORITHM_PREDICTIONS_DATASET = auto() ## Serialized Predictions Format - METRICS_DATAFRAME = auto() - MODEL = auto() - TENSORFLOW_MODEL = auto() - PYTORCH_MODEL = auto() - - -FILE_FORMAT_TO_FILE_ENDING_MAP: Dict[FileFormat, Union[str, List[str]]] = { - ## Map of file formats to file endings. - ## If multiple valid file-endings exist for a format, mention them in decreasing order of preference. - ## Data formats: - ### CSV and TSV: - FileFormat.CSV: [".csv", ".csv.part"], - FileFormat.TSV: [".tsv", ".tsv.part"], - ### JSON and JSONLINES: - FileFormat.JSON: [".json", ".aiw_schema.json"], - FileFormat.JSONLINES: [".jsonl", ".jsonl.part", ".jsonlines.json", ".jsonlines"], - FileFormat.METRICS_JSONLINES: ".metrics.json", - ### YAML: - FileFormat.YAML: [".yaml", ".yml"], - ### Plain text: - FileFormat.PLAIN_TEXT: ".txt", - ### Parquet: - FileFormat.PARQUET: ".parquet", - ### Pickled Python objects: - FileFormat.PICKLE: ".pickle", ## Ref: https://docs.python.org/3.7/library/pickle.html#examples - ### Excel: - FileFormat.EXCEL: ".xlsx", - ### LIBSVM: - FileFormat.LIBSVM: ".libsvm", - ### Compressed: - FileFormat.ZIP: ".zip", - ## Image: - FileFormat.PNG: ".png", - FileFormat.JPEG: [".jpg", ".jpeg"], - FileFormat.TIFF: [".tif", ".tiff"], - FileFormat.BMP: ".bmp", - FileFormat.GIF: ".gif", - FileFormat.ICO: ".ico", - FileFormat.WEBP: ".webp", - FileFormat.SVG: ".svg", - ## Algorithm formats: - ### BlazingText: - FileFormat.BLAZINGTEXT: ".blazingtext.txt", - ### FastText: - FileFormat.FASTTEXT: ".fasttext.txt", - ### VowpalWabbit: - FileFormat.VOWPALWABBIT: ".vw.txt", - ### XGBoost: - FileFormat.XGBOOST: ".xgboost.libsvm", ## LIBSVM is used for XGB, CatBoost, LightGBM, etc. 
- ### TFRecord: - FileFormat.TFRECORD: ".tfrecord", - ## EmbeddingFormats: - ### NPZ: - FileFormat.NPZ: ".npz", -} - -FILE_ENDING_TO_FILE_FORMAT_MAP: Dict[str, FileFormat] = {} -for file_format, file_ending in FILE_FORMAT_TO_FILE_ENDING_MAP.items(): - for fe in as_list(file_ending): - if fe in FILE_ENDING_TO_FILE_FORMAT_MAP: - raise ValueError(f"Cannot have duplicate file-ending keys: {fe}") - FILE_ENDING_TO_FILE_FORMAT_MAP[fe] = file_format - -FILE_FORMAT_TO_CONTENT_TYPE_MAP: Dict[FileFormat, str] = { - FileFormat.CSV: "text/csv", - FileFormat.TSV: "text/tsv", - FileFormat.PARQUET: "application/parquet", - FileFormat.EXCEL: "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet", - ## Ref: https://developer.mozilla.org/en-US/docs/Web/HTTP/Basics_of_HTTP/MIME_types/Common_types - FileFormat.JSON: "application/json", - FileFormat.JSONLINES: "application/jsonlines", - FileFormat.YAML: "application/x-yaml", - FileFormat.LIBSVM: "text/libsvm", - FileFormat.PICKLE: "application/octet-stream", ## Ref: https://stackoverflow.com/a/40433504 - FileFormat.TFRECORD: "application/x-tfexample", - FileFormat.PLAIN_TEXT: "text/plain", - FileFormat.ZIP: "application/zip", - FileFormat.PNG: "image/png", - FileFormat.JPEG: "image/jpeg", - FileFormat.TIFF: "image/tiff", - FileFormat.BMP: "image/bmp", - FileFormat.GIF: "image/gif", - FileFormat.ICO: "image/vnd.microsoft.icon", - FileFormat.WEBP: "image/webp", - FileFormat.SVG: "image/svg+xml", - ## Made-up algorithm content types: - FileFormat.BLAZINGTEXT: "application/blazingtext", - FileFormat.XGBOOST: "application/xgboost", - FileFormat.FASTTEXT: "application/fasttext", - FileFormat.VOWPALWABBIT: "application/vw", -} -CONTENT_TYPE_TO_FILE_FORMAT_MAP: Dict[str, FileFormat] = {} -for file_format, content_type in FILE_FORMAT_TO_CONTENT_TYPE_MAP.items(): - if content_type in CONTENT_TYPE_TO_FILE_FORMAT_MAP: - raise ValueError(f"Cannot have duplicate content-type keys: {content_type}") - CONTENT_TYPE_TO_FILE_FORMAT_MAP[content_type] = file_format - -BINARY_FILE_FORMATS: List[FileFormat] = [ - FileFormat.BIN, - FileFormat.PARQUET, - FileFormat.EXCEL, - FileFormat.PICKLE, - FileFormat.TFRECORD, -] - -CONFIG_FILE_FORMATS: List[FileFormat] = [ - FileFormat.JSON, - FileFormat.YAML, -] - -DATAFRAME_FILE_FORMATS: List[FileFormat] = [ - FileFormat.CSV, - FileFormat.TSV, - FileFormat.PARQUET, - FileFormat.EXCEL, - FileFormat.JSONLINES, -] diff --git a/src/fmcore/constants/MLConstants.py b/src/fmcore/constants/MLConstants.py deleted file mode 100755 index 54d8c53..0000000 --- a/src/fmcore/constants/MLConstants.py +++ /dev/null @@ -1,127 +0,0 @@ -from typing import * - -from fmcore.util import AutoEnum, auto - - -class Task(AutoEnum): - """ - A Task should only relate to the outputs, not the inputs! - E.g. "Image classification" is not a valid task type, it should just be "classification". - Within classification, output variation can be made, especially if the predictions and metrics are different. - E.g. binary, multi-class and multi-label classification can all be considered different tasks since they have - significantly different metrics. 
- """ - - ## Classification - BINARY_CLASSIFICATION = auto() - MULTI_CLASS_CLASSIFICATION = auto() - MULTI_LABEL_CLASSIFICATION = auto() - - ## Regression - REGRESSION = auto() - - ## Embedding - EMBEDDING = auto() - - NER = auto() - - ## Ranking & Retrieval - RETRIEVAL_CORPUS = auto() ## For Datasets - RANKING = auto() - RETRIEVAL = auto() - - ## Prompting-based techniques - NEXT_TOKEN_PREDICTION = auto() ## Core task - IN_CONTEXT_LEARNING = auto() ## Derived task - - ## Audio & Speech - TEXT_TO_SPEECH = auto() - - -TaskType = Task - -TaskOrStr = Union[Task, str] - - -class MLType(AutoEnum): - ## "Data" MLTypes: - BOOL = auto() - TEXT = auto() - CATEGORICAL = auto() - INT = auto() - FLOAT = auto() - VECTOR = auto() - SPARSE_VECTOR = auto() - TIMESTAMP = auto() - TENSOR = auto() - OBJECT = auto() - - ## "Asset" MLTypes: - DOCUMENT = auto() ## For .txt documents, PDFs, etc - IMAGE = auto() - AUDIO = auto() - VIDEO = auto() - - ## Schema MLTypes: - INDEX = auto() - GROUND_TRUTH = auto() - PREDICTED_LABEL = auto() - PREDICTED_PROBABILITY = auto() - PREDICTED = auto() - - ## Ground truth label(s): - GROUND_TRUTH_LABEL = auto() ## TODO: Delete this. - GROUND_TRUTH_LABEL_LIST = auto() - GROUND_TRUTH_LABEL_COMMA_SEPARATED = auto() - GROUND_TRUTH_LABEL_COMMA_SEPARATED_OR_LIST = auto() - ENCODED_LABEL = auto() - ENCODED_LABEL_LIST = auto() - ENCODED_LABEL_COMMA_SEPARATED = auto() - ENCODED_LABEL_COMMA_SEPARATED_OR_LIST = auto() - - ## Predicted label(s): - PREDICTED_LABEL_COMMA_SEPARATED_OR_LIST = auto() - ENCODED_PREDICTED_LABEL = auto() - - ## Predicted probability score(s): - PROBABILITY_SCORE = auto() - PROBABILITY_SCORE_COMMA_SEPERATED_OR_LIST = auto() - PREDICTED_CORRECT = auto() - PREDICTION_IS_CONFIDENT = auto() - ## Each element stores a list [predicted_label, predicted_score, is_confident]: - PREDICTED_LABEL_PREDICTED_SCORE_IS_CONFIDENT_VECTOR = auto() - - -DATA_ML_TYPES: Set[MLType] = { - MLType.BOOL, - MLType.TEXT, - MLType.CATEGORICAL, - MLType.INT, - MLType.FLOAT, - MLType.VECTOR, - MLType.SPARSE_VECTOR, - MLType.TIMESTAMP, - MLType.TENSOR, -} - -ASSET_ML_TYPES: Set[MLType] = { - MLType.DOCUMENT, - MLType.IMAGE, - MLType.AUDIO, - MLType.VIDEO, -} - -PREDICTED_ML_TYPES: Set[MLType] = { - MLType.PREDICTED, - MLType.PREDICTED_LABEL, - MLType.PREDICTED_PROBABILITY, -} - -GROUND_TRUTH_ML_TYPES: Set[MLType] = { - MLType.GROUND_TRUTH, - MLType.GROUND_TRUTH_LABEL, -} - -MLTypeSchema = Dict[str, MLType] - -MLTypeOrStr = Union[MLType, str] diff --git a/src/fmcore/constants/AlgorithmConstants.py b/src/fmcore/constants/_AlgorithmConstants.py similarity index 94% rename from src/fmcore/constants/AlgorithmConstants.py rename to src/fmcore/constants/_AlgorithmConstants.py index abc6a09..166c2e0 100644 --- a/src/fmcore/constants/AlgorithmConstants.py +++ b/src/fmcore/constants/_AlgorithmConstants.py @@ -1,4 +1,4 @@ -from fmcore.util import AutoEnum, alias, auto +from autoenum import AutoEnum, alias, auto K_FOLD_NAME_PREFIX = "fold_" diff --git a/src/fmcore/constants/MetricConstants.py b/src/fmcore/constants/_MetricConstants.py similarity index 80% rename from src/fmcore/constants/MetricConstants.py rename to src/fmcore/constants/_MetricConstants.py index 867a44e..1b28148 100644 --- a/src/fmcore/constants/MetricConstants.py +++ b/src/fmcore/constants/_MetricConstants.py @@ -1,6 +1,4 @@ -from typing import * - -from fmcore.util import AutoEnum, auto +from autoenum import AutoEnum, auto class ThresholdStrategy(AutoEnum): diff --git a/src/fmcore/constants/_TaskConstants.py 
b/src/fmcore/constants/_TaskConstants.py new file mode 100755 index 0000000..1ee6451 --- /dev/null +++ b/src/fmcore/constants/_TaskConstants.py @@ -0,0 +1,41 @@ +from typing import Union + +from autoenum import AutoEnum, auto + + +class Task(AutoEnum): + """ + A Task should only relate to the outputs, not the inputs! + E.g. "Image classification" is not a valid task type, it should just be "classification". + Within classification, output variation can be made, especially if the predictions and metrics are different. + E.g. binary, multi-class and multi-label classification can all be considered different tasks since they have + significantly different metrics. + """ + + ## Classification + BINARY_CLASSIFICATION = auto() + MULTI_CLASS_CLASSIFICATION = auto() + MULTI_LABEL_CLASSIFICATION = auto() + + ## Regression + REGRESSION = auto() + + ## Embedding + EMBEDDING = auto() + + NER = auto() + + ## Ranking & Retrieval + RETRIEVAL_CORPUS = auto() ## For Datasets + RANKING = auto() + RETRIEVAL = auto() + + ## Prompting-based techniques + NEXT_TOKEN_PREDICTION = auto() ## Core task + IN_CONTEXT_LEARNING = auto() ## Derived task + + ## Audio & Speech + TEXT_TO_SPEECH = auto() + + +TaskOrStr = Union[Task, str] diff --git a/src/fmcore/constants/VisualizationConstants.py b/src/fmcore/constants/_VisualizationConstants.py similarity index 89% rename from src/fmcore/constants/VisualizationConstants.py rename to src/fmcore/constants/_VisualizationConstants.py index ee62746..311f058 100644 --- a/src/fmcore/constants/VisualizationConstants.py +++ b/src/fmcore/constants/_VisualizationConstants.py @@ -1,7 +1,12 @@ from importlib import import_module -from typing import * +from typing import ( + Dict, + List, + Set, +) -from fmcore.util import AutoEnum, auto, optional_dependency +from autoenum import AutoEnum, auto +from bears.util import optional_dependency class VisualizationBackend(AutoEnum): diff --git a/src/fmcore/constants/__init__.py b/src/fmcore/constants/__init__.py index 2c56063..5b540d6 100644 --- a/src/fmcore/constants/__init__.py +++ b/src/fmcore/constants/__init__.py @@ -1,7 +1,5 @@ -from fmcore.constants.DataProcessingConstants import * -from fmcore.constants.MLConstants import * -from fmcore.constants.FileConstants import * -from fmcore.constants.AlgorithmConstants import * -from fmcore.constants.MetricConstants import * -from fmcore.constants.VisualizationConstants import * -_LIBRARY_NAME: str = 'fmcore' +from bears.constants import * +from fmcore.constants._TaskConstants import * +from fmcore.constants._AlgorithmConstants import * +from fmcore.constants._MetricConstants import * +from fmcore.constants._VisualizationConstants import * \ No newline at end of file diff --git a/src/fmcore/data/FileMetadata.py b/src/fmcore/data/FileMetadata.py deleted file mode 100644 index 95092f3..0000000 --- a/src/fmcore/data/FileMetadata.py +++ /dev/null @@ -1,352 +0,0 @@ -import io -import os -import pathlib -import tempfile -from typing import * - -import requests -from pydantic import constr, root_validator - -from fmcore.constants import ( - FILE_ENDING_TO_FILE_FORMAT_MAP, - REMOTE_STORAGES, - FileContents, - FileFormat, - MLTypeSchema, - Storage, -) -from fmcore.util import Alias, FileSystemUtil, Parameters, String, safe_validate_arguments -from fmcore.util.aws import S3Util - -FileMetadata = "FileMetadata" - - -class FileMetadata(Parameters): - name: Optional[constr(min_length=1, max_length=63, strip_whitespace=True)] - path: Union[constr(min_length=1, max_length=1023), Any] - storage: 
Optional[Storage] - format: Optional[FileFormat] - contents: Optional[FileContents] - file_glob: Optional[str] - data_schema: Optional[MLTypeSchema] - - @classmethod - def of(cls, path: Union[io.IOBase, FileMetadata, Dict, str], **kwargs) -> "FileMetadata": - if isinstance(path, FileMetadata): - path: Dict = path.dict(exclude=None) - elif isinstance(path, (str, pathlib.Path)): - path: Dict = dict(path=str(path)) - elif isinstance(path, io.IOBase): - path: Dict = dict(path=path) - assert isinstance(path, dict) - path: Dict = {**path, **kwargs} - return FileMetadata(**path) - - @root_validator(pre=True) - def set_params(cls, params: Dict): - Alias.set_format(params) - if isinstance(params["path"], pathlib.Path): - params["path"]: str = str(params["path"]) - if isinstance(params["path"], str) and params["path"].startswith("~"): - params["path"]: str = FileSystemUtil.expand_dir(params["path"]) - - if "storage" not in params: - params["storage"]: Storage = cls.detect_storage(params["path"]) - if params["storage"] is Storage.STREAM: - raise ValueError("Storage cannot be a stream.") - elif params["storage"] is Storage.LOCAL_FILE_SYSTEM: - params["path"]: str = FileSystemUtil.expand_dir(params["path"]) - - if "format" not in params: - format: Optional[FileFormat] = cls.detect_file_format(params["path"], raise_error=False) - if format is not None: - params["format"] = format - return params - - def is_remote_storage(self, remote_storages: Tuple[Storage, ...] = tuple(REMOTE_STORAGES)) -> bool: - return self.storage in remote_storages - - @classmethod - @safe_validate_arguments - def detect_storage( - cls, path: Union[io.IOBase, constr(min_length=1, max_length=1023)] - ) -> Optional[Storage]: - if isinstance(path, io.IOBase) and hasattr(path, "read"): - return Storage.STREAM - elif isinstance(path, str): - if path.startswith(String.HTTP_PREFIX) or path.startswith(String.HTTPS_PREFIX): - return Storage.URL - if S3Util.is_valid_s3_path(path): - return Storage.S3 - return Storage.LOCAL_FILE_SYSTEM - return None - - @classmethod - @safe_validate_arguments - def detect_file_ending( - cls, - file_path: constr(min_length=1, max_length=1023), - raise_error: bool = True, - ) -> Optional[str]: - if FileSystemUtil.is_path_valid_dir(file_path) or S3Util.is_path_valid_s3_dir(file_path): - if raise_error: - raise ValueError(f"Cannot detect file ending of directory: {file_path}") - return None - ## Works for both local and S3 paths: - file_ending: List = pathlib.Path(str(file_path)).suffixes - if len(file_ending) == 0: - return None - return "".join(file_ending) - - @classmethod - def path_exists(cls, path: Any) -> bool: - if not isinstance(path, (str, pathlib.Path)): - return False - path: str = str(path) - storage: Storage = cls.detect_storage(path) - if storage is Storage.LOCAL_FILE_SYSTEM: - return FileSystemUtil.exists(path) - elif storage is Storage.S3: - return S3Util.s3_object_exists(path) - elif storage is Storage.URL: - return str(requests.head(path).status_code).startswith("2") - raise NotImplementedError(f'Cannot determine whether following path exists on {storage}: "{path}"') - - def exists(self) -> bool: - return self.path_exists(path=self.path) - - @classmethod - @safe_validate_arguments - def detect_file_format( - cls, - file_path: constr(min_length=1, max_length=1023), - raise_error: bool = True, - ) -> Optional[FileFormat]: - if FileSystemUtil.is_path_valid_dir(file_path) or S3Util.is_path_valid_s3_dir(file_path): - if raise_error: - raise ValueError(f"Cannot detect file format of directory: 
{file_path}") - return None - fpath_stripped: str = str(file_path).rstrip() - matched_file_endings_longest_first: List[Tuple[str, FileFormat]] = sorted( - [ - (file_ending, file_format) - for file_ending, file_format in FILE_ENDING_TO_FILE_FORMAT_MAP.items() - if fpath_stripped.endswith(file_ending) - ], - key=lambda x: len(x[0]), - reverse=True, - ) - if len(matched_file_endings_longest_first) == 0: - if raise_error: - raise ValueError(f'No matching file format found for file with path: "{file_path}"') - return None - return matched_file_endings_longest_first[0][1] - - @safe_validate_arguments - def open(self, file_name: Optional[str] = None, mode: Optional[str] = None, tmpdir: Optional[str] = None): - if self.is_path_valid_dir() and file_name is None: - raise ValueError( - f"When the path is a directory, you must pass `file_name` to {self.class_name}.open(...)" - ) - elif not self.is_path_valid_dir() and file_name is not None: - raise ValueError( - f"When the file metadata path is a file, you must not pass `file_name` " - f"to {self.class_name}.open(...)" - ) - if self.storage is Storage.LOCAL_FILE_SYSTEM: - if self.is_path_valid_dir(): - assert file_name is not None - local_file: FileMetadata = self.file_in_dir(file_name, return_metadata=True) - else: - assert file_name is None - local_file: FileMetadata = self - elif self.is_remote_storage(): - if self.is_path_valid_dir() and file_name is not None: - remote_file: FileMetadata = self.file_in_dir(file_name, return_metadata=True) - if tmpdir is None: - tmpdir: str = tempfile.TemporaryDirectory().name - temp_local_dir: FileMetadata = FileMetadata.of(tmpdir).mkdir(return_metadata=True) - temp_local_file: FileMetadata = temp_local_dir.file_in_dir(file_name, return_metadata=True) - if remote_file.storage is Storage.S3: - if not S3Util.copy_s3_file_to_local( - source_s3_path=remote_file.path, - destination_local_path=temp_local_file.path, - ): - raise OSError( - f'Cannot download file from "{remote_file.path}" to "{temp_local_file.path}"' - ) - else: - raise NotImplementedError(f"Can only load from S3, not {self.storage}") - if not temp_local_file.exists(): - raise OSError(f'No such file on {temp_local_file.storage}: "{temp_local_file}"') - local_file: FileMetadata = temp_local_file - else: - raise NotImplementedError(f'Cannot yet open a folder from remote location: "{self.path}"') - else: - raise NotImplementedError(f"Cannot open storage: {self.storage}") - return io.open(local_file.path, mode=mode) - - def get_dir(self, return_metadata: bool = False) -> Union[FileMetadata, str]: - if self.storage is Storage.LOCAL_FILE_SYSTEM: - dir_path: str = FileSystemUtil.get_dir(self.path) - elif self.storage is Storage.S3: - dir_path: str = S3Util.get_s3_dir(self.path) - else: - raise NotImplementedError(f"Cannot get dir for path on {self.storage} storage.") - if return_metadata: - return self.update_params(path=dir_path) - return dir_path - - def file_in_dir( - self, - path: str, - return_metadata: bool = False, - touch: bool = False, - **kwargs, - ) -> Union[FileMetadata, str]: - file_in_dir: str = self.path_in_dir(path, is_dir=False, **kwargs) - if touch: - if self.storage is Storage.LOCAL_FILE_SYSTEM: - FileSystemUtil.touch_file(file_in_dir) - elif self.storage is Storage.S3: - S3Util.touch_s3_object(file_in_dir) - else: - raise ValueError(f"Cannot touch file on {self.storage} storage.") - if return_metadata: - return self.update_params(path=file_in_dir) - return file_in_dir - - def __truediv__(self, subdir_name: str) -> FileMetadata: - assert 
isinstance(subdir_name, str) - return self.subdir_in_dir( - subdir_name, - return_metadata=True, - ) - - def subdir_in_dir( - self, - path: Optional[str], - *, - mkdir: bool = True, - return_dir_on_none: bool = False, - raise_error: bool = True, - return_metadata: bool = False, - **kwargs, - ) -> Union[FileMetadata, str]: - if path is None and return_dir_on_none: - if return_metadata: - return self.copy() - return self.path - subdir_path: str = self.path_in_dir(path, is_dir=True, **kwargs) - if mkdir: - FileMetadata(path=subdir_path, **self.dict(exclude={"path"})).mkdir(raise_error=raise_error) - if return_metadata: - return self.update_params(path=subdir_path) - return subdir_path - - def path_in_dir(self, path: str, is_dir: bool, **kwargs) -> str: - if self.storage is Storage.LOCAL_FILE_SYSTEM: - return FileSystemUtil.construct_path_in_dir(self.path, path, is_dir=is_dir, **kwargs) - elif self.storage is Storage.S3: - if not S3Util.is_valid_s3_path(self.path): - raise ValueError( - f'Cannot create path of file/subdir with name "{path}" in directory; ' - f'base directory path "{self.path}" is invalid.' - ) - return S3Util.construct_path_in_s3_dir(self.path, path, is_dir=is_dir, **kwargs) - raise NotImplementedError( - f'Cannot create path {path} in dir "{self.path}" for storage: {self.storage}' - ) - - def mkdir(self, raise_error: bool = True, return_metadata: bool = False) -> Union[FileMetadata, str]: - if self.storage is Storage.LOCAL_FILE_SYSTEM: - path: str = self.path - if not path.endswith(os.path.sep): - path += os.path.sep - FileSystemUtil.mkdir_if_does_not_exist(path, raise_error=raise_error) - if return_metadata: - return self - return path - elif self.storage is Storage.URL: - raise ValueError(f"Cannot create a directory at URL: {self.path}") - elif self.storage is Storage.STREAM: - raise ValueError("Cannot create a directory for a stream.") - elif self.storage is Storage.S3: - path: str = self.path - if not path.endswith(String.SLASH): - path += String.SLASH - if return_metadata: - return self - return path ## Do nothing, S3 dirs do not need to be created. - raise NotImplementedError(f"Unsupported storage: {self.storage}") - - def mksubdir(self, subdir_name: str, raise_error: bool = True) -> bool: - if self.storage is Storage.LOCAL_FILE_SYSTEM: - if not FileSystemUtil.is_path_valid_dir(self.path): - raise ValueError( - f'Cannot create subdirectory with name "{subdir_name}" in directory; ' - f'directory path "{self.path}" is invalid.' - ) - return FileSystemUtil.mkdir_if_does_not_exist( - self.subdir_in_dir(subdir_name), - raise_error=raise_error, - ) - elif self.storage is Storage.URL: - raise ValueError(f"Cannot create a directory at URL: {self.path}") - elif self.storage is Storage.STREAM: - raise ValueError("Cannot create a directory for a stream.") - elif self.storage is Storage.S3: - return True ## Do nothing, S3 dirs do not need to be created. 
- raise NotImplementedError(f"Unsupported storage: {self.storage}") - - def is_path_valid_dir(self) -> bool: - if self.storage is Storage.LOCAL_FILE_SYSTEM: - return FileSystemUtil.is_path_valid_dir(self.path) - elif self.storage is Storage.S3: - return S3Util.is_path_valid_s3_dir(self.path) - elif self.storage is Storage.URL: - return self.path.endswith(String.SLASH) - return False - - def list(self, **kwargs) -> List[str]: - if not self.is_path_valid_dir(): - raise ValueError(f'Path "{self.path}" is not a valid directory.') - if self.file_glob is not None: - kwargs.setdefault("file_glob", self.file_glob) - if self.storage is Storage.LOCAL_FILE_SYSTEM: - return FileSystemUtil.list(self.path, **kwargs) - elif self.storage is Storage.S3: - return S3Util.list(self.path, **kwargs) - raise ValueError(f"Cannot list files in {self.storage} path: {self.path}") - - def list_metadata(self, **kwargs) -> List[FileMetadata]: - if not self.is_path_valid_dir(): - raise ValueError(f'Path "{self.path}" is not a valid directory.') - files_metadata: List[FileMetadata] = [ - self.update_params( - name=None, - path=fpath, - file_glob=None, - ) - for fpath in self.list(**kwargs) - ] - return files_metadata - - def copy_to_dir(self, destination: Union[FileMetadata, Dict, str]) -> bool: - if self.is_path_valid_dir() is False: - raise ValueError(f'Source path is not a valid directory: "{self.path}"') - destination: FileMetadata = FileMetadata.of(destination) - if destination.is_path_valid_dir() is False: - raise ValueError(f'Destination path is not a valid directory: "{destination.path}"') - if self.storage is Storage.LOCAL_FILE_SYSTEM and destination.storage is Storage.LOCAL_FILE_SYSTEM: - return FileSystemUtil.copy_dir(self.path, destination.path) - elif self.storage is Storage.LOCAL_FILE_SYSTEM and destination.storage is Storage.S3: - return S3Util.copy_local_dir_to_s3(self.path, destination.path) - elif self.storage is Storage.S3 and destination.storage is Storage.LOCAL_FILE_SYSTEM: - return S3Util.copy_s3_dir_to_local(self.path, destination.path) - elif self.storage is Storage.S3 and destination.storage is Storage.S3: - return S3Util.copy_dir_between_s3_locations(self.path, destination.path) - raise NotImplementedError( - f'Copying from source storage "{self.storage}" to destination storage "{destination.storage}" ' - f"is not yet supported." 
- ) diff --git a/src/fmcore/data/__init__.py b/src/fmcore/data/__init__.py index ba584bd..b7bc779 100644 --- a/src/fmcore/data/__init__.py +++ b/src/fmcore/data/__init__.py @@ -1,7 +1,2 @@ -from fmcore.data.sdf.ScalableSeries import * -from fmcore.data.sdf.ScalableDataFrame import * -from fmcore.data.FileMetadata import * -from fmcore.data.asset import * -from fmcore.data.reader.Reader import * -from fmcore.data.writer.Writer import * -from fmcore.data.pipeline import * +from fmcore.data.reader import * +from fmcore.data.writer import * \ No newline at end of file diff --git a/src/fmcore/data/asset.py b/src/fmcore/data/asset.py deleted file mode 100644 index 433cbea..0000000 --- a/src/fmcore/data/asset.py +++ /dev/null @@ -1,145 +0,0 @@ -from abc import ABC -from typing import * - -import numpy as np -from pydantic import conint, root_validator -from pydantic.typing import Literal - -from fmcore.constants import ( - AVAILABLE_TENSOR_TYPES, - SHORTHAND_TO_TENSOR_LAYOUT_MAP, - TENSOR_LAYOUT_TO_SHORTHAND_MAP, - DataLayout, - MLType, -) -from fmcore.data.FileMetadata import FileMetadata -from fmcore.util import Parameters, Registry, String, optional_dependency, type_str - - -class Asset(Parameters, Registry, ABC): - _allow_subclass_override = True - - mltype: ClassVar[MLType] - path: Optional[Union[FileMetadata, str]] = None - data: Any - layout: DataLayout - - @root_validator(pre=True) - def validate_params(cls, params: Dict) -> Dict: - params["layout"]: DataLayout = cls.detect_layout(params["data"]) - return params - - @classmethod - def detect_layout(cls, data: Any, raise_error: bool = True) -> Optional[DataLayout]: - for layout, dtype in AVAILABLE_TENSOR_TYPES.items(): - if isinstance(data, dtype): - return layout - if raise_error: - raise ValueError(f"Cannot detect layout for data of type {type_str(data)}.") - return None - - @classmethod - def _registry_keys(cls) -> MLType: - return cls.mltype - - def as_tensor(self, tensor_type_or_layout: Union[DataLayout, str], **kwargs) -> Optional[Any]: - tensor_layout: DataLayout = tensor_type_or_layout - if not isinstance(tensor_layout, DataLayout): - if DataLayout.matches_any(tensor_layout): - tensor_layout: DataLayout = DataLayout.from_str(tensor_layout) - else: - tensor_layout: DataLayout = SHORTHAND_TO_TENSOR_LAYOUT_MAP[ - String.str_normalize(tensor_type_or_layout) - ] - if tensor_layout not in TENSOR_LAYOUT_TO_SHORTHAND_MAP: - raise ValueError( - f"Argument `tensor_type_or_layout`: {tensor_layout} is not a valid tensor layout. 
" - f"supported tensor layouts: {list(TENSOR_LAYOUT_TO_SHORTHAND_MAP.values())}" - ) - if tensor_layout not in AVAILABLE_TENSOR_TYPES: - raise ValueError( - f"Corresponding package has not been installed for argument `tensor_type_or_layout`: {tensor_layout}`; " - f"available packages: {list(AVAILABLE_TENSOR_TYPES.keys())}" - ) - if tensor_layout is DataLayout.NUMPY: - return self.numpy(**kwargs) - if tensor_layout is DataLayout.TORCH: - return self.torch(**kwargs) - raise NotImplementedError(f"Unsupported value of `tensor_type_or_layout`: {tensor_type_or_layout}") - - def numpy(self, error: Literal["raise", "warn", "ignore"] = "raise", **kwargs) -> Optional[Any]: - if self.layout is DataLayout.NUMPY: - return self.data - if self.layout is DataLayout.TORCH: - return self.data.cpu().numpy() - if error == "raise": - pass - return None - - def torch(self, error: Literal["raise", "warn", "ignore"] = "raise", **kwargs) -> Optional[Any]: - if self.layout is DataLayout.NUMPY: - import torch - - return torch.from_numpy(self.data) - if self.layout is DataLayout.TORCH: - return self.data - if error == "raise": - pass - return None - - -class Image(Asset): - mltype = MLType.IMAGE - - height: conint(ge=1) - width: conint(ge=1) - color_mode: Literal["G", "RGB", "BRG"] - channels: Literal["first", "last"] - - def to_pil_image(self) -> Optional[Any]: - img: np.ndarray = self.to_channels_last().numpy() - with optional_dependency("torchvision", "PIL"): - from PIL import Image as PILImage - from torchvision.transforms.functional import to_pil_image - - img: PILImage = to_pil_image(img) - return img - return None - - def to_channels_first(self) -> Asset: - if self.channels == "first": - return self - if self.layout is DataLayout.NUMPY: - moveaxis: Callable = np.moveaxis - elif self.layout is DataLayout.TORCH: - moveaxis: Callable = torch.moveaxis - else: - raise NotImplementedError() - img = moveaxis(self.data, -1, 0) - return Image( - data=img, - channels="first", - **self.dict(exclude={"data", "channels"}), - ) - - def to_channels_last(self) -> Asset: - if self.channels == "last": - return self - if self.layout is DataLayout.NUMPY: - moveaxis: Callable = np.moveaxis - elif self.layout is DataLayout.TORCH: - moveaxis: Callable = torch.moveaxis - else: - raise NotImplementedError() - img = moveaxis(self.data, 0, -1) - return Image( - data=img, - channels="last", - **self.dict(exclude={"data", "channels"}), - ) - - -class Audio(Asset): - mltype = MLType.AUDIO - - sampling_rate: conint(ge=1) diff --git a/src/fmcore/data/pipeline.py b/src/fmcore/data/pipeline.py deleted file mode 100644 index 717a8e6..0000000 --- a/src/fmcore/data/pipeline.py +++ /dev/null @@ -1,1130 +0,0 @@ -import copy -import io -import json -import time -from abc import ABC, abstractmethod -from collections import OrderedDict -from math import inf -from typing import * - -import cloudpickle -import numpy as np -from pydantic import confloat, conint, constr, root_validator - -from fmcore.constants import FileContents, MissingColumnBehavior, MLType, MLTypeSchema, ProcessingMode -from fmcore.data.FileMetadata import FileMetadata -from fmcore.data.processor import DataProcessor, Nto1ColumnProcessor, SingleColumnProcessor -from fmcore.data.reader import ConfigReader, Reader -from fmcore.data.sdf import DataLayout, ScalableDataFrame, ScalableDataFrameRawType -from fmcore.data.writer import DataFrameWriter, Writer -from fmcore.util import ( - AutoEnum, - FractionalBool, - Log, - Parameters, - Registry, - String, - UserEnteredParameters, - as_list, 
- auto, - filter_string_list, - get_subset, - is_subset, - keep_keys, - keep_values, - measure_time_ms, - safe_validate_arguments, - type_str, -) - -AlgorithmDatasetWriter = "AlgorithmDatasetWriter" -DataProcessingPipeline = "DataProcessingPipeline" -DataProcessingPipelineStep = "DataProcessingPipelineStep" -DataProcessingPipelineStepProcessor = "DataProcessingPipelineStepProcessor" - -PipelineWriter = Union[DataFrameWriter, AlgorithmDatasetWriter] - - -class PersistLevel(AutoEnum): - DONT_PERSIST = auto() - BEFORE_PIPELINE = auto() - AFTER_PIPELINE = auto() - BEFORE_AFTER_PIPELINE = auto() - EVERY_PIPELINE_STEP = auto() - EVERY_PROCESSOR = auto() - - -class ProcessorPerf(Parameters): - start_time: confloat(ge=0.0) - processing_mode: ProcessingMode - input_columns: List[str] - output_columns: List[str] - data_processor_class_name: str - data_processor_params: Dict - persist_time_ms: Optional[confloat(ge=0.0)] - end_time: confloat(ge=0.0) - time_ms: Optional[confloat(ge=0.0)] - - @root_validator(pre=True) - def set_time_ms(cls, params): - params["time_ms"] = 1000 * (params["end_time"] - params["start_time"]) - return params - - -class PipelineStepPerf(Parameters): - start_time: confloat(ge=0.0) - processing_mode: ProcessingMode - num_rows_processed: Optional[conint(ge=1)] - length_calculation_ms: Optional[confloat(ge=0.0)] - processor_perfs: List[ProcessorPerf] - persist_time_ms: Optional[confloat(ge=0.0)] - end_time: confloat(ge=0.0) - time_ms: Optional[confloat(ge=0.0)] - - @root_validator(pre=True) - def set_time_ms(cls, params): - params["time_ms"] = 1000 * (params["end_time"] - params["start_time"]) - return params - - -class PipelineWriterPerf(Parameters): - start_time: confloat(ge=0.0) - input_columns: List[str] - writer_class_name: str - writer_params: Dict - end_time: confloat(ge=0.0) - time_ms: Optional[confloat(ge=0.0)] - - @root_validator(pre=True) - def set_time_ms(cls, params): - params["time_ms"] = 1000 * (params["end_time"] - params["start_time"]) - return params - - -class ProcessingPipelinePerf(Parameters): - processing_mode: ProcessingMode - persist: PersistLevel - is_input_ScalableDataFrame: bool - should_log_perf: bool - - start_time: confloat(ge=0.0) - layout_detection_time_ms: confloat(ge=0.0) - input_data_layout: DataLayout - - persist_read_time_ms: Optional[confloat(ge=0.0)] - - length_calculation_ms: Optional[confloat(ge=0.0)] - process_as: DataLayout - layout_conversion_ms: confloat(ge=0.0) - - pipeline_steps_compute_time_ms: Optional[confloat(ge=0.0)] - pipeline_step_perfs: Optional[List[PipelineStepPerf]] - persist_compute_time_ms: Optional[confloat(ge=0.0)] - - pipeline_write_time_ms: Optional[confloat(ge=0.0)] - pipeline_writer_perfs: Optional[List[PipelineWriterPerf]] - - num_rows_processed: conint(ge=1) - - end_time: confloat(ge=0.0) - time_ms: Optional[confloat(ge=0.0)] - - @root_validator(pre=True) - def set_time_ms(cls, params): - params["time_ms"] = 1000 * (params["end_time"] - params["start_time"]) - return params - - -class DataProcessingPipelineConfig(UserEnteredParameters): - """Structure in YAML file.""" - - class StepConfig(UserEnteredParameters): - input: Union[List[Union[MLType, str]], MLType, str] - output: constr(min_length=1, strip_whitespace=True) = "{col_name}" - params: Optional[Dict[str, Any]] = None - transformer: constr(min_length=1, strip_whitespace=True) ## Data Processor name - - class WriterConfig(UserEnteredParameters): - input: Union[List[Union[MLType, str]], MLType, str] - writer: constr(min_length=1, strip_whitespace=True) - 
params: Dict[str, Any] = {} - schema_override: MLTypeSchema = {} - - pipeline: List[StepConfig] = [] - writers_config: List[WriterConfig] = [] - - -class DataProcessingPipelineStepProcessor(Parameters, Registry, ABC): - data_processor_class: ClassVar[Type[DataProcessor]] - data_processor: DataProcessor - output_col_name: str - output_mltype: MLType - - @classmethod - def _registry_keys(cls) -> Optional[Union[List[Any], Any]]: - return [cls.data_processor_class, cls.data_processor_class.__name__] - - @classmethod - @abstractmethod - def create_pipeline_step_processors( - cls, - DataProcessorClass: Type[DataProcessor], - filtered_input_schema: MLTypeSchema, - name: str, - params: Dict, - output_pattern: str, - ) -> Dict[Union[str, Tuple[str]], DataProcessingPipelineStepProcessor]: - """ - Static factory to create a mapping from input column(s) to data processor instances and their output. - :param filtered_input_schema: input schema with only the relevant columns which we must transform. - Each key is a column name from the input data, and each value is its corresponding MLType. - :param name: name of the data processor(s). - :param params: dict of params to initialize the data processor(s). - :param output_pattern: used to name the output columns. - :return: Depending on the type of processor (1:1, N:1, etc), the returned map will have each key as a single - column or a tuple of columns. The value is the data processor instance which will transform a single column or - set of columns, respectively. - E.g. for 1:1 we might return: - { - "ASIN_STATIC_ITEM_NAME": - (<__TFIDFVectorization_at_87da792f>, 'ASIN_STATIC_ITEM_NAME_TFIDF_15000', MLType.VECTOR), - "ASIN_STATIC_BULLET_POINT": - (<__TFIDFVectorization_at_adf90eb8>, 'ASIN_STATIC_BULLET_POINT_TFIDF_15000', MLType.VECTOR) - } - E.g. for N:1 we might return: - { - ("ASIN_STATIC_ITEM_NAME", "ASIN_STATIC_BULLET_POINT"): - (<__TextConcatenation_at_92ba33e>, 'CONCATENATED_TEXT_COLUMNS', MLType.TEXT) - } - """ - pass - - @classmethod - @abstractmethod - def get_pipeline_step_output_schema( - cls, - input_schema: MLTypeSchema, - pipeline_step_processors: Dict[Union[str, Tuple[str]], DataProcessingPipelineStepProcessor], - ) -> MLTypeSchema: - """ - Obtains the output schema from the input data processors dict. - :param input_schema: schema with all columns in the current DataFrame. - :param pipeline_step_processors: map from column(s) which must be transformed, to data processor - and its outputs. This should be the output of a call to `create_pipeline_step_processors`. - :return: the updated output schema. Columns which are not in `pipeline_step_processors` are copied as-is. - For other columns this function will use the output column names and MLTypes in `pipeline_step_processors` - to add the corresponding columns to the input schema. This becomes the output schema which is returned. 
- """ - pass - - -class DataProcessingPipelineStepSingleColumnProcessor(DataProcessingPipelineStepProcessor): - data_processor_class: ClassVar[Type[DataProcessor]] = SingleColumnProcessor - - @classmethod - def create_pipeline_step_processors( - cls, - DataProcessorClass: Type[SingleColumnProcessor], - filtered_input_schema: MLTypeSchema, - name: str, - params: Dict, - output_pattern: str, - ) -> Dict[Union[str, Tuple[str]], DataProcessingPipelineStepProcessor]: - pipeline_step_processors: Dict[Union[str, Tuple[str]], DataProcessingPipelineStepProcessor] = {} - for input_col, input_mltype in filtered_input_schema.items(): - processor_input_schema: MLTypeSchema = {input_col: input_mltype} - data_processor: SingleColumnProcessor = DataProcessorClass( - name=name, - data_schema=processor_input_schema, - params=params, - ) - supported_input_mltypes: Tuple[MLType] = data_processor.input_mltypes - assert input_mltype in supported_input_mltypes, ( - f'"{str(input_mltype)}" not included in supported MLTypes: "{str(supported_input_mltypes)}"' - ) - ## For 1:1 data processors, the supported input MLType should be a list of MLTypes - if not all([isinstance(mltype, MLType) for mltype in supported_input_mltypes]): - raise AttributeError( - f"Supported input types for class {str(cls)} (1:1 data processor) " - + f"should be a list of MLTypes, not: {supported_input_mltypes}" - ) - ## Converts '{col_name}_XYZ' to 'MyCol_XYZ' but leaves 'XYZ' unchanged. - output_col_name = output_pattern.format(col_name=input_col) - output_mltype = data_processor.output_mltype - pipeline_step_processors[input_col] = cls( - data_processor=data_processor, output_col_name=output_col_name, output_mltype=output_mltype - ) - return pipeline_step_processors - - @classmethod - def get_pipeline_step_output_schema( - cls, - input_schema: MLTypeSchema, - pipeline_step_processors: Dict[Union[str, Tuple[str]], DataProcessingPipelineStepProcessor], - ) -> MLTypeSchema: - output_schema = copy.deepcopy(input_schema) - for input_cols, step_processor in pipeline_step_processors.items(): - if step_processor.output_col_name is not None and step_processor.output_mltype is not None: - output_schema[step_processor.output_col_name] = step_processor.output_mltype - return output_schema - - -class DataProcessingPipelineStepNto1ColumnProcessor(DataProcessingPipelineStepProcessor): - data_processor_class: ClassVar[Type[DataProcessor]] = Nto1ColumnProcessor - - @classmethod - def create_pipeline_step_processors( - cls, - DataProcessorClass: Type[Nto1ColumnProcessor], - filtered_input_schema: MLTypeSchema, - name: str, - params: Dict, - output_pattern: str, - ) -> Dict[Union[str, Tuple[str]], DataProcessingPipelineStepProcessor]: - pipeline_step_processors: Dict[Union[str, Tuple[str]], DataProcessingPipelineStepProcessor] = {} - ## Sorted tuple of columns we want to pass to this data processor. - input_cols: Tuple[str] = tuple(filtered_input_schema.keys()) - if len(input_cols) > 0: - processor: Nto1ColumnProcessor = DataProcessorClass( - name=name, - data_schema=copy.deepcopy(filtered_input_schema), - params=params, - ) - supported_input_mltypes: Tuple[MLType, ...] 
= processor.input_mltypes - ## For N:1 data processors, the supported input MLType should be a list of MLTypes - if not all([isinstance(mltype, MLType) for mltype in supported_input_mltypes]): - raise AttributeError( - f"Supported input types for {str(cls)} (N:1 data processor) " - + f"should be a list of MLTypes, not: {supported_input_mltypes}" - ) - if not all([mltype in supported_input_mltypes for mltype in filtered_input_schema.values()]): - raise AttributeError( - f"MLTypes of selected columns passed to {str(cls)} (N:1 data processor) " - + "should be supported by this data processor. Supported types are " - + f"{supported_input_mltypes}, selected columns have MLTypes: " - + f"{list(filtered_input_schema.values())}" - ) - output_col_name = output_pattern ## Assume it does not have {col_name} in it. - output_mltype: MLType = processor.output_mltype - pipeline_step_processors[input_cols] = cls( - data_processor=processor, - output_col_name=output_col_name, - output_mltype=output_mltype, - ) - return pipeline_step_processors - - @classmethod - def get_pipeline_step_output_schema( - cls, - input_schema: MLTypeSchema, - pipeline_step_processors: Dict[Union[str, Tuple[str]], DataProcessingPipelineStepProcessor], - ) -> MLTypeSchema: - output_schema = copy.deepcopy(input_schema) - ## dict returned by create_pipeline_step_processors should have exactly one item. - assert len(pipeline_step_processors) <= 1 - for input_cols, step_processor in pipeline_step_processors.items(): - if step_processor.output_col_name is not None and step_processor.output_mltype is not None: - output_schema[step_processor.output_col_name] = step_processor.output_mltype - return output_schema - - -class DataProcessingPipelineStep(Parameters): - input_schema: MLTypeSchema - pipeline_step_processors: Dict[Union[str, Tuple], DataProcessingPipelineStepProcessor] - output_schema: MLTypeSchema - - def __str__(self): - out_str = f"{self.__class__.__name__}:" - out_str += "\n >> Input schema: " + str(MLType.convert_values_to_str(self.input_schema)) - out_str += "\n >> Data Processors map:" - for cols_to_transform, step_processor in self.pipeline_step_processors.items(): - out_str += f"\n - Columns to transform: {str(cols_to_transform)}" - out_str += f"\n Data Processor: {step_processor.data_processor.class_name}" - if len(step_processor.data_processor.params.dict()) > 0: - out_str += f" ({step_processor.data_processor.params})" - out_str += f"\n Output column: {str(step_processor.output_col_name)} ({str(step_processor.output_mltype)})" - out_str += "\n >> Output schema: " + str(MLType.convert_values_to_str(self.output_schema)) - return out_str - - @classmethod - @safe_validate_arguments - def from_config( - cls, - step_cfg: DataProcessingPipelineConfig.StepConfig, - step_input_schema: MLTypeSchema, - ) -> Any: - """ - Static factory to resolve and instantiate a pipeline step object. - Resolution includes: - - Add filtered input schema to the pipeline step - - Add a collection of data processors to the pipeline step - - Add an output schema to the pipeline step - :param step_cfg: pipeline step configuration. - :param step_input_schema: the schema of the DataFrame at this step. - :return: Serializable DataProcessingPipelineStep instance. 
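        Example (an illustrative sketch only: the column name, output pattern and params below are hypothetical, reusing the registered 1:1 processor name "TFIDFVectorization" from the examples above):

            step = DataProcessingPipelineStep.from_config(
                step_cfg=DataProcessingPipelineConfig.StepConfig(
                    input=[MLType.TEXT],                ## select every TEXT column from the step's input schema
                    transformer="TFIDFVectorization",   ## name of a registered DataProcessor subclass
                    output="{col_name}_TFIDF",          ## per-column output naming pattern (illustrative)
                    params={},                          ## processor params, left empty in this sketch
                ),
                step_input_schema={"ASIN_STATIC_ITEM_NAME": MLType.TEXT},
            )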
- """ - ## Extract variables: - DataProcessorClass: Type[DataProcessor] = DataProcessor.get_subclass(step_cfg.transformer) - if issubclass(DataProcessorClass, SingleColumnProcessor): - DataProcessorSuperClass: Type[DataProcessor] = SingleColumnProcessor - elif issubclass(DataProcessorClass, Nto1ColumnProcessor): - DataProcessorSuperClass: Type[DataProcessor] = Nto1ColumnProcessor - else: - raise NotImplementedError( - f"Unsupported subtype of {DataProcessor}: {DataProcessorClass}, " - f"with following inheritance: {DataProcessorClass.__mro__}" - ) - - DataProcessingPipelineStepProcessorClass: Type[DataProcessingPipelineStepProcessor] = ( - DataProcessingPipelineStepProcessor.get_subclass(DataProcessorSuperClass) - ) - ## Create data processors and output schema: - ## Note: selection of columns from the pipeline config is case insensitive. User might enter 'AbCD' but the - ## appropriate columns 'abcd' will be picked up from the DataFrame schema. - filtered_step_input_schema: MLTypeSchema = PipelineUtil.filter_schema_by_input_patterns( - step_input_schema, - step_cfg.input, - ) - try: - pipeline_step_processors: Dict[Union[str, Tuple[str]], DataProcessingPipelineStepProcessor] = ( - DataProcessingPipelineStepProcessorClass.create_pipeline_step_processors( - DataProcessorClass=DataProcessorClass, - filtered_input_schema=filtered_step_input_schema, - name=step_cfg.transformer, - params=step_cfg.params, - output_pattern=step_cfg.output, - ) - ) - except Exception as e: - print(String.format_exception_msg(e)) - raise AttributeError( - f'Error while creating data processor of type "{str(DataProcessorClass)}" ' - f"with params: {str(step_cfg.params)} " - f"and filtered input schema {str(filtered_step_input_schema)}" - ) - output_schema: MLTypeSchema = ( - DataProcessingPipelineStepProcessorClass.get_pipeline_step_output_schema( - input_schema=step_input_schema, pipeline_step_processors=pipeline_step_processors - ) - ) - return DataProcessingPipelineStep( - input_schema=filtered_step_input_schema, - pipeline_step_processors=pipeline_step_processors, - output_schema=output_schema, - ) - - def execute_pipeline_step( - self, - sdf: ScalableDataFrame, - processing_mode: ProcessingMode, - persist: PersistLevel, - should_measure_perf: bool, - should_log_perf: bool, - ) -> Tuple[ScalableDataFrame, Optional[PipelineStepPerf]]: - """ - Runs the particular pipeline step on the input ScalableDataFrame. - :param sdf: input ScalableDataFrame to process. - :param processing_mode: what this step should do, e.g. fit-transform, transform, etc. - :param persist: how often to persist the ScalableDataFrame every so often. - :param should_measure_perf: whether to measure performance information. - :param should_log_perf: whether to log performance information. 
- :return: transformed ScalableDataFrame (or raw data) after executing this step - """ - step_start_time = time.perf_counter() - if should_log_perf: - Log.debug(f"\n>> Running {processing_mode.lower().replace('_', '-')} on pipeline step...") - _processor_perfs: List[ProcessorPerf] = [] - for input_cols, step_processors in self.pipeline_step_processors.items(): - data_processor: DataProcessor = step_processors.data_processor - output_col_name: str = step_processors.output_col_name - input_cols: List[str] = as_list(input_cols) - sdf_cols: List[str] = list(sdf.columns) - if ( - is_subset(input_cols, sdf_cols) - or data_processor.missing_column_behavior is MissingColumnBehavior.EXECUTE - ): - ## Apply data processor on whatever subset exists, retaining column order: - cols_to_process_set: Set[str] = get_subset(input_cols, sdf_cols) - cols_to_process_in_order: List[str] = [ - col for col in input_cols if col in cols_to_process_set - ] - if isinstance(data_processor, SingleColumnProcessor): - if len(cols_to_process_in_order) != 1: - raise ValueError(f"Expected only one column, found: {cols_to_process_in_order}") - cols_to_process_in_order: str = cols_to_process_in_order[0] - processor_start_time = time.perf_counter() - if should_log_perf: - Log.debug( - f"\n>> Running {processing_mode.lower().replace('_', '-')} " - f"on {type_str(sdf)}, using:\n{str(data_processor)}" - ) - sdf: ScalableDataFrame = self._execute_data_processor( - sdf=sdf, - cols_to_process_in_order=cols_to_process_in_order, - data_processor=data_processor, - processing_mode=processing_mode, - output_col_name=output_col_name, - ) - persist_time_ms: Optional[float] = None - if persist is PersistLevel.EVERY_PROCESSOR: - sdf, persist_time_ms = measure_time_ms(lambda: sdf.persist(wait=True)) - - processor_end_time: float = time.perf_counter() - if should_log_perf: - Log.debug( - f"\r...processor ran in " - f"{String.readable_seconds(processor_end_time - processor_start_time)}." - ) - if should_measure_perf: - _processor_perfs.append( - ProcessorPerf( - start_time=processor_start_time, - processing_mode=processing_mode, - input_columns=as_list(cols_to_process_in_order), - output_columns=as_list(output_col_name), - data_processor_class_name=data_processor.class_name, - data_processor_params=data_processor.params.dict(), - persist_time_ms=persist_time_ms, - end_time=processor_end_time, - ) - ) - elif data_processor.missing_column_behavior is MissingColumnBehavior.SKIP: - continue - elif data_processor.missing_column_behavior is MissingColumnBehavior.ERROR: - raise ValueError( - f"Cannot transform {type_str(sdf)} using {data_processor.class_name} due to insufficient columns: " - f"columns required for transformation: {input_cols}; " - f"columns actually present: {sdf_cols}" - ) - else: - raise NotImplementedError( - f"Unsupported value for {MissingColumnBehavior}: {data_processor.missing_column_behavior}" - ) - persist_time_ms: Optional[float] = None - if persist is PersistLevel.EVERY_PIPELINE_STEP: - sdf, persist_time_ms = measure_time_ms(lambda: sdf.persist(wait=True)) - - step_end_time: float = time.perf_counter() - if should_log_perf: - Log.debug( - f"\r...pipeline-step ran in {String.readable_seconds(step_end_time - step_start_time)}." 
- ) - step_end_time: float = time.perf_counter() - if should_measure_perf: - if sdf.layout is not DataLayout.DASK: - sdf_num_rows, length_calculation_ms = measure_time_ms(lambda: len(sdf)) - else: - sdf_num_rows, length_calculation_ms = None, None - return sdf, PipelineStepPerf( - start_time=step_start_time, - processing_mode=processing_mode, - num_rows_processed=sdf_num_rows, - length_calculation_ms=length_calculation_ms, - processor_perfs=_processor_perfs, - persist_time_ms=persist_time_ms, - end_time=step_end_time, - ) - return sdf, None - - def _execute_data_processor( - self, - sdf: ScalableDataFrame, - cols_to_process_in_order: List[str], - data_processor: DataProcessor, - processing_mode: ProcessingMode, - output_col_name: str, - ) -> ScalableDataFrame: - if processing_mode is ProcessingMode.FIT_TRANSFORM: - sdf[output_col_name] = data_processor.fit_transform(sdf[cols_to_process_in_order]) - elif processing_mode is ProcessingMode.TRANSFORM: - sdf[output_col_name] = data_processor.transform(sdf[cols_to_process_in_order]) - return sdf - - -class DataProcessingPipeline(Parameters): - input_schema: MLTypeSchema - pipeline: List[DataProcessingPipelineStep] - output_schema: MLTypeSchema - writers: Dict[FileContents, PipelineWriter] = {} - layout_scaling: Optional[Dict[ProcessingMode, Tuple[Tuple[confloat(ge=1), DataLayout], ...]]] = { - ProcessingMode.FIT_TRANSFORM: ( - ## Determines which layout to use with different number of rows. - (1_000, DataLayout.DICT), ## <= 1k rows, use DataLayout.DICT - (500_000, DataLayout.PANDAS), ## <= 500k rows, use DataLayout.PANDAS - (inf, DataLayout.DASK), ## >500k rows, use DataLayout.DASK - ), - ProcessingMode.TRANSFORM: ( - ## Determines which layout to use with different number of rows. - (5, DataLayout.LIST_OF_DICT), ## <= 5 rows, use DataLayout.LIST_OF_DICT - (1_000, DataLayout.DICT), ## <= 1k rows, use DataLayout.DICT - (125_000, DataLayout.PANDAS), ## <= 125k rows, use DataLayout.PANDAS - (inf, DataLayout.DASK), ## >125k rows, use DataLayout.DASK - ), - } - _performance: List[ProcessingPipelinePerf] = [] ## For performance tracking - - # @classmethod - # @safe_validate_arguments - # def from_steps( - # cls, - # input_schema: MLTypeSchema, - # process: List[Union[ - # DataProcessor, - # Tuple[str, DataProcessor], - # Tuple[DataProcessor, str], - # Tuple[str, DataProcessor, str] - # ]], - # select: List[Union[MLType, str]], - # write: Optional[List[Writer]] = None - # ) -> DataProcessingPipeline: - # process: List = as_list(process) - # select: List = as_list(process) - # if write is not None: - # write: List = as_list(write) - # current_schema: MLTypeSchema = copy.deepcopy(input_schema) - # processing_steps: List[DataProcessingPipelineStep] = [] - # for processor_tuple in process: - # if isinstance(processor_tuple, DataProcessor): - # DataProcessingPipelineStep( - # input_schema=filtered_step_input_schema, - # data_processors=data_processors, - # output_schema=output_schema, - # ) - - @classmethod - @safe_validate_arguments - def from_config( - cls, - config: Union[DataProcessingPipelineConfig, FileMetadata], - input_schema: MLTypeSchema, - only_writers: bool = False, - *args, - **kwargs, - ) -> DataProcessingPipeline: - """ - Static factory to resolve each pipeline step and instantiate the pipeline object. - :param config: either DataProcessingPipelineConfig or config file (YAML/JSON) with pipeline steps and writers. - :param input_schema: schema of the input dataframe this pipeline can process. 
- :param only_writers: if True, then only the writers will be initialized. - :return: Serializable DataProcessingPipeline instance. - """ - if isinstance(config, FileMetadata): - reader: Reader = Reader.of(config.format) - assert isinstance(reader, ConfigReader) - Log.debug("\nReading pipeline config...") - config: DataProcessingPipelineConfig = DataProcessingPipelineConfig( - **reader.read_metadata(config) - ) - Log.debug("...done reading pipeline config.") - if not only_writers: - return cls._resolve_pipeline( - input_schema=input_schema, - pipeline_steps=config.pipeline, - writers=config.writers_config, - *args, - **kwargs, - ) - else: - return cls._resolve_pipeline( - input_schema=input_schema, - pipeline_steps=[], - writers=config.writers_config, - *args, - **kwargs, - ) - - @classmethod - def _resolve_pipeline( - cls, - input_schema: MLTypeSchema, - pipeline_steps: List[DataProcessingPipelineConfig.StepConfig], - writers: Optional[List[DataProcessingPipelineConfig.WriterConfig]] = None, - *args, - **kwargs, - ) -> DataProcessingPipeline: - """ - Static factory to resolve each pipeline step and instantiate the pipeline object. - :param input_schema: schema of the input dataframe this pipeline can process. - :param pipeline_steps: list of pipeline steps input by the user. - :param writers: list of Dataframe or Algorithm writers input by the user. - Some of these may be invoked when the file with corresponding properties is passed. - :return: Serializable DataProcessingPipeline instance. - """ - - Log.debug("\nInitializing DataProcessingPipeline...") - Log.debug(f"\n> Input schema to pipeline: {input_schema}") - - ## Resolve pipeline steps: - resolved_pipeline: List[DataProcessingPipelineStep] = [] - cur_schema = input_schema - Log.debug(f"\n> Resolving pipeline transformation steps: {str(pipeline_steps)}") - for pipeline_step in pipeline_steps: - resolved_pipeline_step: DataProcessingPipelineStep = DataProcessingPipelineStep.from_config( - step_cfg=pipeline_step, - step_input_schema=cur_schema, - ) - resolved_pipeline.append(resolved_pipeline_step) - Log.debug(f"Added {str(resolved_pipeline_step)}") - cur_schema: MLTypeSchema = resolved_pipeline_step.output_schema - output_schema: MLTypeSchema = cur_schema - Log.debug("...resolved pipeline transformation steps.") - Log.debug(f"\n> Output schema from pipeline: \n{json.dumps(output_schema, indent=4)}") - - ## Resolve writers: - if writers is None: - writers: Dict[FileContents, PipelineWriter] = {} - else: - Log.debug("\n> Resolving pipeline writers...") - writers: Dict[FileContents, PipelineWriter] = cls._resolve_pipeline_writers( - writers=writers, - output_schema=output_schema, - *args, - **kwargs, - ) - Log.debug("...resolved pipeline writers.") - - ## Instantiate: - pipeline = DataProcessingPipeline( - input_schema=input_schema, - pipeline=resolved_pipeline, - output_schema=output_schema, - writers=writers, - ) - Log.debug("...done initializing pipeline.") - return pipeline - - @classmethod - @safe_validate_arguments - def _resolve_pipeline_writers( - cls, - writers: List[DataProcessingPipelineConfig.WriterConfig], - output_schema: MLTypeSchema, - *args, - **kwargs, - ) -> Dict[FileContents, PipelineWriter]: - pipeline_writers: Dict[FileContents, PipelineWriter] = {} - for writer_cfg in writers: - writer: PipelineWriter = cls._create_pipeline_writer( - writer_cfg, - output_schema, - ) - for supported_file_content in writer.file_contents: - if supported_file_content in pipeline_writers: - raise KeyError( - f"Only one writer of 
{supported_file_content} contents can be present" - f"in the pipeline. Found two writers of type {supported_file_content}." - ) - pipeline_writers[supported_file_content] = writer - Log.debug(f'Set writer of key "{str(supported_file_content)}" as {str(writer)}') - return pipeline_writers - - @classmethod - def _create_pipeline_writer( - cls, - writer_cfg: DataProcessingPipelineConfig.WriterConfig, - output_schema: MLTypeSchema, - ) -> PipelineWriter: - writer_cfg: DataProcessingPipelineConfig.WriterConfig = writer_cfg.copy(deep=True) - WriterClass: Type[Writer] = Writer.get_subclass(writer_cfg.writer) - if not ( - isinstance(WriterClass, DataFrameWriter.__class__) - or isinstance(WriterClass, AlgorithmDatasetWriter.__class__) - ): - raise TypeError( - f"Pipeline writers must be of type {str(DataFrameWriter.class_name)} or " - f"{str(AlgorithmDatasetWriter.class_name)}, found: {WriterClass.class_name}." - ) - - ## Overwrite keys in the output schema with those present in the writer config (if any): - writer_data_schema: MLTypeSchema = { - **output_schema, - **writer_cfg.schema_override, - } - writer_data_schema: MLTypeSchema = PipelineUtil.filter_schema_by_input_patterns( - schema=writer_data_schema, input_patterns=writer_cfg.input - ) - writer_cfg.params["data_schema"] = writer_data_schema - return WriterClass(**writer_cfg.params) - - @safe_validate_arguments - def get_writer_by_file_contents(self, file_contents: FileContents) -> Optional[Writer]: - return self.writers.get(file_contents) - - # def fit(self, df, ) - # def transform(self, df, ) - # def fit_transform(self, df, ) - - @safe_validate_arguments - def execute( - self, - data: Union[ScalableDataFrame, ScalableDataFrameRawType], - processing_mode: ProcessingMode, - process_as: Optional[DataLayout] = None, - measure_perf: FractionalBool = True, - log_perf: FractionalBool = True, - persist: PersistLevel = PersistLevel.DONT_PERSIST, - write_to: Optional[Union[List[FileMetadata], FileMetadata]] = None, - overwrite: bool = False, - rnd: Optional[confloat(ge=0.0, le=1.0)] = None, - **kwargs, - ) -> Union[ScalableDataFrame, ScalableDataFrameRawType]: - """ - Executes each pipeline step on the input DataFrame in a sequential fashion. - :param data: input ScalableDataFrame or raw type (Pandas, Dask, List of Dicts, etc). - :param processing_mode: fit, fit_transform, transform - :param process_as: data layout to run the pipeline. - :param measure_perf: how often to measure performance. - If False, it will not measure performance. If True, it will measure performance. - If 0.0 < measure_perf < 1.0, then we will measure performance a fraction of the time. - :param log_perf: how often to log performance. - If False, it will not log performance. If True, it will always log performance. - If 0.0 < log_perf < 1.0, then we will log performance a fraction of the time. - :param persist: how often to persist processed results (for lazily-evaluated dataframes). - :param write_to: output files to write to using the configured writers. - :param overwrite: whether to overwrite the path while writing. - :param rnd: Optional random value (passed to ensure end-to-end logging and performance measurement). - :return: the transformed DataFrame. 
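        Example (a minimal sketch; `pipeline_cfg`, `df` and `test_rows` are hypothetical placeholders, and only arguments documented above are used):

            pipeline = DataProcessingPipeline.from_config(
                config=pipeline_cfg,    ## hypothetical FileMetadata pointing to a pipeline YAML/JSON config
                input_schema={"ASIN_STATIC_ITEM_NAME": MLType.TEXT},
            )
            train_out = pipeline.execute(
                data=df,                                        ## e.g. a pandas DataFrame of training rows
                processing_mode=ProcessingMode.FIT_TRANSFORM,   ## fit processors, then transform
                persist=PersistLevel.DONT_PERSIST,
            )
            test_out = pipeline.execute(
                data=test_rows,                                 ## hypothetical held-out rows
                processing_mode=ProcessingMode.TRANSFORM,       ## reuse the already-fitted processors
            )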
- """ - pipeline_start_time = time.time() - if rnd is None: - rnd: float = np.random.random() - should_measure_perf: bool = rnd <= measure_perf - should_log_perf: bool = rnd <= log_perf - - if should_measure_perf: - Log.info(f"\nRunning pipeline in {processing_mode.lower().replace('_', '-')} mode on dataset...") - - ## Detect layout if the input is raw data: - is_input_ScalableDataFrame: bool = isinstance(data, ScalableDataFrame) - sdf, layout_detection_time_ms = measure_time_ms(lambda: ScalableDataFrame.of(data, layout=None)) - input_data_layout: DataLayout = sdf.layout - - ## For lazy-loaded DataFrames (e.g. Dask, Spark), read data from file: - persist_read_time_ms: Optional[float] = None - if persist in { - PersistLevel.BEFORE_PIPELINE, - PersistLevel.BEFORE_AFTER_PIPELINE, - PersistLevel.EVERY_PIPELINE_STEP, - PersistLevel.EVERY_PROCESSOR, - }: - sdf, persist_read_time_ms = measure_time_ms(lambda: sdf.persist(wait=True)) - - ## Convert to different layout used to process: - sdf_num_rows: Optional[int] = None - length_calculation_ms: Optional[float] = None - if process_as is None: - if sdf_num_rows is None: - sdf_num_rows, length_calculation_ms = measure_time_ms(lambda: len(sdf)) - for sdf_num_rows_limit, process_as in self.layout_scaling[processing_mode]: - if sdf_num_rows <= sdf_num_rows_limit: - break ## Sets data_layout - layout_conversion_ms: float = 0.0 - if process_as is not DataLayout.RECORD: - sdf, layout_conversion_ms = measure_time_ms(lambda: sdf.as_layout(layout=process_as)) - - ## Run the pipeline: - pipeline_step_perfs: Optional[List[PipelineStepPerf]] = None - pipeline_steps_compute_time_ms: Optional[float] = None - if len(self.pipeline) > 0: - pipeline_steps_compute_start_time: float = time.time() - if processing_mode is ProcessingMode.TRANSFORM and process_as in { - DataLayout.LIST_OF_DICT, - DataLayout.RECORD, - }: - sdf, pipeline_step_perfs = self._transform_as_records( - sdf=sdf, - processing_mode=processing_mode, - persist=persist, - should_measure_perf=should_measure_perf, - should_log_perf=should_log_perf, - ) - else: - sdf, pipeline_step_perfs = self._execute_as_sdf( - sdf=sdf, - processing_mode=processing_mode, - persist=persist, - should_measure_perf=should_measure_perf, - should_log_perf=should_log_perf, - process_as=process_as, - ) - - pipeline_steps_compute_end_time: float = time.time() - pipeline_steps_compute_time_ms: float = 1000 * ( - pipeline_steps_compute_end_time - pipeline_steps_compute_start_time - ) - - ## For lazy-loaded DataFrames (e.g. 
Dask, Spark), this actually starts the data-processing: - persist_compute_time_ms: Optional[float] = None - if persist in { - PersistLevel.AFTER_PIPELINE, - PersistLevel.BEFORE_AFTER_PIPELINE, - }: - sdf, persist_compute_time_ms = measure_time_ms(lambda: sdf.persist(wait=True)) - - ## Write data to files - pipeline_writer_perfs: Optional[List[PipelineWriterPerf]] = None - pipeline_write_time_ms: Optional[float] = None - if write_to is not None: - write_to: List[FileMetadata] = as_list(write_to) - pipeline_write_start_time: float = time.time() - pipeline_writer_perfs: List[PipelineWriterPerf] = self._write_processed( - sdf=sdf, - processing_mode=processing_mode, - should_measure_perf=should_measure_perf, - should_log_perf=should_log_perf, - write_to=write_to, - overwrite=overwrite, - **kwargs, - ) - pipeline_write_end_time: float = time.time() - pipeline_write_time_ms: float = 1000 * (pipeline_write_end_time - pipeline_write_start_time) - - ## Log and measure performance - pipeline_end_time: float = time.time() - if should_log_perf: - writers_log_str: str = ( - f" and running {len(self.writers)} writers " if write_to is not None else " " - ) - Log.info( - f"...done running pipeline in {processing_mode.lower().replace('_', '-')} mode{writers_log_str}in " - f"{String.readable_seconds(pipeline_end_time - pipeline_start_time)}." - ) - if should_measure_perf: - if sdf_num_rows is None: - sdf_num_rows, length_calculation_ms = measure_time_ms(lambda: len(sdf)) - pipeline_end_time: float = time.time() - self._performance.append( - ProcessingPipelinePerf( - processing_mode=processing_mode, - persist=persist, - is_input_ScalableDataFrame=is_input_ScalableDataFrame, - should_log_perf=should_log_perf, - start_time=pipeline_start_time, - layout_detection_time_ms=layout_detection_time_ms, - input_data_layout=input_data_layout, - persist_read_time_ms=persist_read_time_ms, - length_calculation_ms=length_calculation_ms, - process_as=process_as, - layout_conversion_ms=layout_conversion_ms, - pipeline_steps_compute_time_ms=pipeline_steps_compute_time_ms, - pipeline_step_perfs=pipeline_step_perfs, - persist_compute_time_ms=persist_compute_time_ms, - pipeline_write_time_ms=pipeline_write_time_ms, - pipeline_writer_perfs=pipeline_writer_perfs, - num_rows_processed=sdf_num_rows, - end_time=pipeline_end_time, - ) - ) - if is_input_ScalableDataFrame: - return sdf - return sdf._data - - def _transform_as_records( - self, - sdf: ScalableDataFrame, - processing_mode: ProcessingMode, - persist: PersistLevel, - should_measure_perf: bool, - should_log_perf: bool, - ) -> Tuple[ScalableDataFrame, List[PipelineStepPerf]]: - record_sdfs: List[ScalableDataFrame] = list( - sdf.stream(stream_as=DataLayout.RECORD, num_rows=1, shuffle=False, raw=False) - ) - for i in range(len(record_sdfs)): - for pipeline_step_i, pipeline_step in enumerate(self.pipeline): - record_sdfs[i], _step_perf = pipeline_step.execute_pipeline_step( - sdf=record_sdfs[i], - processing_mode=processing_mode, - persist=PersistLevel.DONT_PERSIST, - should_measure_perf=should_measure_perf, - should_log_perf=should_log_perf, - ) - ## TODO: log perfs - record_sdf_concat: ScalableDataFrame = ScalableDataFrame.concat( - record_sdfs, - reset_index=True, - layout=sdf.layout, - ) - if record_sdf_concat.layout != sdf.layout: - raise ValueError( - f"Expected the output {ScalableDataFrame.__name__} to have layout " - f"{sdf.layout}; found layout {record_sdf_concat.layout}" - ) - return record_sdf_concat, [] - - def _execute_as_sdf( - self, - sdf: ScalableDataFrame, - 
processing_mode: ProcessingMode, - persist: PersistLevel, - should_measure_perf: bool, - should_log_perf: bool, - process_as: DataLayout, - ) -> Tuple[ScalableDataFrame, List[PipelineStepPerf]]: - pipeline_step_perfs: List[PipelineStepPerf] = [] - for pipeline_step in self.pipeline: - sdf, _step_perf = pipeline_step.execute_pipeline_step( - sdf=sdf, - processing_mode=processing_mode, - persist=persist, - should_measure_perf=should_measure_perf, - should_log_perf=should_log_perf, - ) - if sdf.layout != process_as: - raise ValueError( - f"Expected the output {ScalableDataFrame.__name__} of the following step to have layout " - f"{process_as}; found layout {sdf.layout}: {str(pipeline_step)}" - ) - if should_measure_perf: - pipeline_step_perfs.append(_step_perf) - return sdf, pipeline_step_perfs - - def _write_processed( - self, - sdf: ScalableDataFrame, - processing_mode: ProcessingMode, - should_measure_perf: bool, - should_log_perf: bool, - write_to: List[FileMetadata], - overwrite: bool = False, - **kwargs, - ) -> Optional[List[PipelineWriterPerf]]: - writers_start_time: float = time.time() - if should_log_perf: - Log.debug( - f"\nWriting dataset after {processing_mode.lower().replace('_', '-')}, " - f"using {len(self.writers)} writers..." - ) - - _writer_perfs: List[PipelineWriterPerf] = [] - for file in write_to: - writer_start_time: float = time.time() - writer: Writer = self.writers.get(file.contents) - if writer is None: - raise KeyError( - f"While writing from pipeline, could not find writer for the following output metadata " - f"(with contents {file.contents}):\n{str(file)}" - ) - if should_log_perf: - Log.debug(f"\n>> Writing processed data using {str(writer)}") - if not writer.write_metadata(file, sdf, overwrite=overwrite, **kwargs): - raise IOError("Could not write pipeline output to file.") - writer_end_time: float = time.time() - if should_log_perf: - Log.debug( - f"\r...writer ran in {String.readable_seconds(writer_end_time - writer_start_time)}." - ) - if should_measure_perf: - _writer_perfs.append( - PipelineWriterPerf( - start_time=writer_start_time, - input_columns=sorted(list(writer.data_schema.keys())), - writer_class_name=writer.class_name, - writer_params=writer.params.dict(), - end_time=writer_end_time, - ) - ) - writers_end_time: float = time.time() - if should_log_perf: - if processing_mode is ProcessingMode.FIT_TRANSFORM: - Log.info( - f"...done running writers in " - f"{String.readable_seconds(writers_end_time - writers_start_time)}." - ) - return _writer_perfs - - def serialize(self, file: str): - """ - Serialize the pipeline object (and all its data processors) using the cloudpickle library, which Ray uses. - Ref: https://github.com/cloudpipe/cloudpickle - """ - ## TODO: create a writer for pickled objects. - Log.debug("\nSerializing pipeline...") - file = String.assert_not_empty_and_strip(file) - with io.open(file, "wb") as out: - cloudpickle.dump(self, out) - Log.debug("...done serializing pipeline.") - - @classmethod - def deserialize(cls, file) -> DataProcessingPipeline: - ## TODO: create a reader for pickled objects. 
- Log.debug("Reading pipeline file from pickle...") - with io.open(file, "rb") as inp: - pipeline: DataProcessingPipeline = cloudpickle.load(inp) - if not isinstance(pipeline, DataProcessingPipeline): - raise TypeError( - f"Deserialized pipeline is must be an instance of {DataProcessingPipeline.__class__}.", - f"Found object of type {type_str(pipeline)}", - ) - Log.debug("...done reading pipeline file from pickle.") - return pipeline - - -class PipelineUtil: - def __init__(self): - raise TypeError(f"Cannot create {str(self.__class__)} instances.") - - @classmethod - def filter_schema_by_input_patterns(cls, schema: MLTypeSchema, input_patterns: Union[str, List[str]]): - """ - :param schema: Dict where keys are column names and values are strings corresponding to MLTypes. - :param input_patterns: String or list of strings, like '.*_TFIDF', 'NUMERIC', ['TEXT', '.*_TFIDF'] etc. - :return: filtered schema, where we filter based on either the key (if string) or value (if MLType). - """ - filtered_schema: Optional[Dict] = None - if not isinstance(input_patterns, list): - input_patterns = [input_patterns] - filtered_cols = set() - filtered_cols_ordered = [] - for input_pattern in input_patterns: - input_mltype = MLType.from_str(input_pattern, raise_error=False) - if isinstance(input_mltype, MLType): - filtered_mltype_cols = set(keep_values(schema, input_mltype).keys()) - filtered_mltype_cols_list = list(filtered_mltype_cols) - - # This is used for handling cases when Numeric value is present inside String - # Example: If there are two columns named TOP_1_PREDICTED_LABEL and TOP_10_PREDICTED_LABEL - # Then output of sorted would be ['TOP_10_PREDICTED_LABEL', 'TOP_1_PREDICTED_LABEL'] - # This creates problem when using Uncertainty Calculator - # To solve this, first check if all the column names have any digit present. 
- # If yes, then sort it using key (REF: https://stackoverflow.com/a/49232907) - # If no, then sort lexicographically - - if cls.__do_column_names_have_numeric_values(filtered_mltype_cols_list): - filtered_cols_ordered += [ - col - for col in sorted( - filtered_mltype_cols_list, - key=lambda x: int("".join([i for i in x if i.isdigit()])), - ) - if col not in filtered_cols_ordered - ] - else: - filtered_cols_ordered += [ - col for col in sorted(filtered_mltype_cols_list) if col not in filtered_cols_ordered - ] - filtered_cols = filtered_cols.union(filtered_mltype_cols) - elif isinstance(input_pattern, str): - filtered_str_pattern_cols = set( - filter_string_list(list(schema.keys()), input_pattern, ignorecase=True) - ) - filtered_cols = filtered_cols.union(filtered_str_pattern_cols) - filtered_cols_ordered += [ - col for col in sorted(list(filtered_str_pattern_cols)) if col not in filtered_cols_ordered - ] - else: - raise AttributeError( - f"input_pattern must be a str denoting regex or an MLType, found {input_pattern}" - ) - filtered_schema: Dict = keep_keys(schema, list(filtered_cols)) - filtered_schema_ordered = OrderedDict() - for col in filtered_cols_ordered: - filtered_schema_ordered[col] = filtered_schema[col] - - return filtered_schema_ordered - - @classmethod - def __do_column_names_have_numeric_values(cls, filtered_cols_list: List[str]) -> bool: - return all( - [True if any(char.isdigit() for char in col_name) else False for col_name in filtered_cols_list] - ) diff --git a/src/fmcore/data/processor/DataProcessor.py b/src/fmcore/data/processor/DataProcessor.py deleted file mode 100644 index 893ff0c..0000000 --- a/src/fmcore/data/processor/DataProcessor.py +++ /dev/null @@ -1,101 +0,0 @@ -from abc import ABC, abstractmethod -from typing import * - -from pydantic import root_validator, validator - -from fmcore.constants import DataLayout, MissingColumnBehavior, MLType, MLTypeSchema -from fmcore.data.sdf import ScalableDataFrame, ScalableDataFrameRawType, ScalableSeries -from fmcore.util import MutableParameters, Registry, UserEnteredParameters - - -class DataProcessor(MutableParameters, Registry, ABC): - """ - Abstract base class for all data processors. - - Subclasses of this class should be serializable via pickling. - Subclasses must define the following class variables: - - missing_column_behavior: Used in the context of DataProcessingPipeline. This field determined whether to allow - skipping of transformations when the columns required for those transformations are not present in the DataFrame. - E.g. If the pipeline processes the ground truth labels (such as label-encoding), then during inference time ground - truth labels will not be present and transformations declared on the ground truth column cannot run. - - input_mltypes: Lists the input MLTypes types of the columns the data processor can act take as input. - - output_mltype: Lists the output MLType the data processor will return. This is an instance method since it might - vary depending on the parameters used to initialize the data processor. - """ - - missing_column_behavior: ClassVar[MissingColumnBehavior] = MissingColumnBehavior.ERROR - input_mltypes: ClassVar[Tuple[MLType, ...]] - output_mltype: ClassVar[MLType] - - AlreadyFitError: ClassVar[ValueError] = ValueError(".fit() has already been called.") - FitBeforeTransformError: ClassVar[ValueError] = ValueError(".fit() must be called before .transform()") - - class Params(UserEnteredParameters): - """ - BaseModel for parameters. 
Expected to be overridden by subclasses of DataProcessor. - Example: - class CaseTransformer(DataProcessor): - class Params(DataProcessor.Params): - case: Literal['lowercase', 'uppercase'] - """ - - pass - - name: str = None - data_schema: Optional[MLTypeSchema] = None - params: Params = {} - - def __str__(self): - params_str: str = self.json(include={"name": True, "data_schema": True, "params": True}, indent=4) - out: str = f"{self.class_name} with params:\n{params_str}" - return out - - @root_validator(pre=True) - def convert_params(cls, params: Dict): - params["params"] = super(DataProcessor, cls)._convert_params(cls.Params, params.get("params")) - return params - - @validator("name") - def set_name(cls, name: Optional[str]): - if name is None: - name: str = cls.class_name - return name - - @abstractmethod - def fit( - self, - data: Union[ScalableDataFrame, ScalableDataFrameRawType, ScalableSeries, ScalableDataFrameRawType], - process_as: Optional[DataLayout] = None, - ) -> NoReturn: - """ - Fits the data processor instance on the input data. - By default, this is a no-op, i.e. the data processor is assumed to be stateless. - - Any subclass implementation must not modify the input data. - - Any subclass implementation must fit data structure(s) which are serializable via pickling. - :param data: input data which the data processor will use to fit. - :param process_as: data-layout to use while processing. - :return: None - """ - pass - - @abstractmethod - def transform( - self, - data: Union[ScalableDataFrame, ScalableDataFrameRawType, ScalableSeries, ScalableDataFrameRawType], - process_as: Optional[DataLayout] = None, - ) -> Union[ScalableDataFrame, ScalableDataFrameRawType, ScalableSeries, ScalableDataFrameRawType]: - """ - Transforms the input data and returns the result. Any subclass implementation must not modify the input data. - :param data: input data which the data processor will act on. - :param process_as: data-layout to use while processing. - :return: transformed result. 
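As a concrete illustration of this contract, a minimal stateless processor could look like the sketch below (written against the deleted pre-migration API shown in this file; the import paths and MLType members are assumptions and may differ after the move to bears):

from fmcore.constants import MLType
from fmcore.data.processor import DataProcessor


class IdentityTextProcessor(DataProcessor):
    # Declares which column types this processor accepts and produces:
    input_mltypes = [MLType.TEXT]
    output_mltype = MLType.TEXT

    class Params(DataProcessor.Params):
        pass  # No user-entered parameters for this processor.

    def fit(self, data, process_as=None):
        # Stateless processor: nothing is learned from the data.
        pass

    def transform(self, data, process_as=None):
        # Must not modify the input; here it is simply returned as-is.
        return data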
- """ - pass - - def fit_transform( - self, - data: Union[ScalableDataFrame, ScalableDataFrameRawType, ScalableSeries, ScalableDataFrameRawType], - process_as: Optional[DataLayout] = None, - ) -> Union[ScalableDataFrame, ScalableDataFrameRawType, ScalableSeries, ScalableDataFrameRawType]: - self.fit(data, process_as=process_as) - return self.transform(data, process_as=process_as) diff --git a/src/fmcore/data/processor/Nto1ColumnProcessor.py b/src/fmcore/data/processor/Nto1ColumnProcessor.py deleted file mode 100644 index 0859f84..0000000 --- a/src/fmcore/data/processor/Nto1ColumnProcessor.py +++ /dev/null @@ -1,52 +0,0 @@ -from abc import ABC, abstractmethod -from typing import * - -from fmcore.constants import DataLayout -from fmcore.data.processor import DataProcessor -from fmcore.data.sdf import ScalableDataFrame, ScalableOrRaw, ScalableSeries, ScalableSeriesOrRaw, is_scalable -from fmcore.util import safe_validate_arguments - - -class Nto1ColumnProcessor(DataProcessor, ABC): - """Abstract base class for N:1 data processors.""" - - @safe_validate_arguments - def fit( - self, - data: ScalableOrRaw, - process_as: Optional[DataLayout] = None, - ): - data: ScalableDataFrame = ScalableDataFrame.of(data, layout=process_as) - self._fit_df(data) - - def _fit_df(self, data: ScalableDataFrame): - """Fit step is a noop by default.""" - pass - - def __call__(self, *args, **kwargs): - return self.transform(*args, **kwargs) - - @safe_validate_arguments - def transform( - self, - data: ScalableOrRaw, - process_as: Optional[DataLayout] = None, - ) -> ScalableSeriesOrRaw: - output_data: ScalableSeries = self._transform_df(ScalableDataFrame.of(data, layout=process_as)) - if is_scalable(data): - return output_data - return output_data.raw() - - @abstractmethod - def _transform_df(self, data: ScalableDataFrame) -> ScalableSeries: - """N:1 data processors can make optimizations internally as column-wise operations are usually much faster.""" - pass - - @safe_validate_arguments - def fit_transform( - self, - data: ScalableOrRaw, - process_as: Optional[DataLayout] = None, - ) -> ScalableSeries: - self.fit(data, process_as=process_as) - return self.transform(data, process_as=process_as) diff --git a/src/fmcore/data/processor/SingleColumnProcessor.py b/src/fmcore/data/processor/SingleColumnProcessor.py deleted file mode 100644 index 72da3eb..0000000 --- a/src/fmcore/data/processor/SingleColumnProcessor.py +++ /dev/null @@ -1,60 +0,0 @@ -from abc import ABC -from typing import * - -from fmcore.constants import DASK_APPLY_OUTPUT_MLTYPE_TO_META_MAP, DataLayout -from fmcore.data.processor import DataProcessor -from fmcore.data.sdf import ScalableSeries, ScalableSeriesRawType -from fmcore.util import get_current_fn_name - - -class SingleColumnProcessor(DataProcessor, ABC): - """Abstract base class for 1:1 data processors.""" - - def fit( - self, - data: Union[ScalableSeries, ScalableSeriesRawType], - process_as: Optional[DataLayout] = None, - ): - data: ScalableSeries = ScalableSeries.of(data, layout=process_as) - self._fit_series(data) - - def _fit_series(self, data: ScalableSeries): - """Fit step is a noop by default.""" - pass - - def __call__(self, *args, **kwargs): - return self.transform(*args, **kwargs) - - def transform( - self, - data: Union[ScalableSeries, ScalableSeriesRawType], - process_as: Optional[DataLayout] = None, - ) -> Union[ScalableSeries, ScalableSeriesRawType]: - output_data: ScalableSeries = self._transform_series(ScalableSeries.of(data, layout=process_as)) - if isinstance(data, 
ScalableSeries): - return output_data - return output_data.raw() - - def _transform_series(self, data: ScalableSeries) -> ScalableSeries: - """1:1 data processors can make optimizations internally.""" - kwargs = {} - if data.layout is DataLayout.DASK: - if self.output_mltype in DASK_APPLY_OUTPUT_MLTYPE_TO_META_MAP: - kwargs["meta"] = DASK_APPLY_OUTPUT_MLTYPE_TO_META_MAP[self.output_mltype] - return data.apply(self.transform_single, **kwargs) - - def transform_single(self, data: Any) -> Any: - """ - Transforms a single data point using the current data processor. - :param data: input data point - :return: transformed value - """ - raise NotImplementedError(f"{get_current_fn_name()} has not been implemented.") - - def fit_transform( - self, - data: Union[ScalableSeries, ScalableSeriesRawType], - process_as: Optional[DataLayout] = None, - ) -> ScalableSeries: - self.fit(data, process_as=process_as) - return self.transform(data, process_as=process_as) diff --git a/src/fmcore/data/processor/__init__.py b/src/fmcore/data/processor/__init__.py deleted file mode 100644 index b50afa8..0000000 --- a/src/fmcore/data/processor/__init__.py +++ /dev/null @@ -1,8 +0,0 @@ -from fmcore.data.processor.DataProcessor import * -from fmcore.data.processor.SingleColumnProcessor import * -from fmcore.data.processor.Nto1ColumnProcessor import * -from fmcore.data.processor.mixins import * -from fmcore.data.processor.categorical import * -from fmcore.data.processor.numeric import * -from fmcore.data.processor.text import * -from fmcore.data.processor.vector import * \ No newline at end of file diff --git a/src/fmcore/data/processor/categorical/CategoricalMissingValueImputation.py b/src/fmcore/data/processor/categorical/CategoricalMissingValueImputation.py deleted file mode 100644 index 61e1441..0000000 --- a/src/fmcore/data/processor/categorical/CategoricalMissingValueImputation.py +++ /dev/null @@ -1,68 +0,0 @@ -from typing import * - -from pydantic import root_validator - -from fmcore.data.processor import CategoricalInputProcessor, CategoricalOutputProcessor, SingleColumnProcessor -from fmcore.data.sdf import ScalableSeries -from fmcore.util import AutoEnum, auto, is_null - - -class CategoricalImputationStrategy(AutoEnum): - MODE = auto() - CONSTANT = auto() - - -class CategoricalMissingValueImputation( - SingleColumnProcessor, CategoricalInputProcessor, CategoricalOutputProcessor -): - """ - This calculates or fills in the value to be filled in place of nan based on strategy passed as input. 
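For instance, the CONSTANT strategy needs no fit step (a usage sketch under the deleted pre-migration import path; passing `params` as a plain dict mirrors how LabelEncoding constructs itself further below):

from fmcore.data.processor.categorical.CategoricalMissingValueImputation import (
    CategoricalImputationStrategy,
    CategoricalMissingValueImputation,
)

imputer = CategoricalMissingValueImputation(
    params=dict(strategy=CategoricalImputationStrategy.CONSTANT, fill_value="unknown"),
)
# Nulls are replaced with the constant fill value; everything else passes through unchanged:
assert imputer.transform_single(None) == "unknown"
assert imputer.transform_single("dog") == "dog"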
- Params: - - FILL_VALUE: the value to be filled in when it encounters a NaN (This must be only passed when CONSTANT is strategy) - - STRATEGY: this indicates what strategy must be used when NaN is encountered - - MODE: The number which appears most often in a set of numbers - - CONSTANT: This allows the user to pass in a fill value where that fill value will be imputed - """ - - class Params(SingleColumnProcessor.Params): - strategy: CategoricalImputationStrategy - fill_value: Optional[Any] = None - - imputed_value: Optional[Any] = None - - @root_validator(pre=False) - def set_imputed_value(cls, params: Dict): - if params["params"].strategy is CategoricalImputationStrategy.CONSTANT: - if params["params"].fill_value is None: - raise ValueError( - f"Cannot have empty `fill_value` when `strategy` is {CategoricalImputationStrategy.CONSTANT}" - ) - params["imputed_value"] = params["params"].fill_value - elif params["params"].fill_value is not None: - raise ValueError( - f"`fill_value` can only be passed when strategy={CategoricalImputationStrategy.CONSTANT}" - ) - return params - - def _fit_series(self, data: ScalableSeries): - if self.params.strategy is not CategoricalImputationStrategy.CONSTANT: - if self.imputed_value is not None: - raise self.AlreadyFitError - if self.params.strategy is CategoricalImputationStrategy.MODE: - self.imputed_value = self._get_mode(data) - else: - raise NotImplementedError(f"Unsupported strategy: {self.params.strategy}") - - def _get_mode(self, data: ScalableSeries) -> Any: - imputed_value: Any = data.mode().compute().iloc[0] - if not isinstance(imputed_value, str): - if float(imputed_value).is_integer(): - return int(imputed_value) - return imputed_value - - def transform_single(self, data: Optional[Any]) -> Any: - if self.imputed_value is None and self.params.strategy is not CategoricalImputationStrategy.CONSTANT: - raise self.FitBeforeTransformError - if is_null(data): - data = self.imputed_value - return data diff --git a/src/fmcore/data/processor/categorical/LabelAffix.py b/src/fmcore/data/processor/categorical/LabelAffix.py deleted file mode 100644 index 44adf76..0000000 --- a/src/fmcore/data/processor/categorical/LabelAffix.py +++ /dev/null @@ -1,35 +0,0 @@ -from typing import * - -from pydantic import constr - -from fmcore.data.processor import ( - EncodedLabelOutputProcessor, - SingleColumnProcessor, - TextOrLabelInputProcessor, -) -from fmcore.util import is_null - - -class LabelAffix(SingleColumnProcessor, TextOrLabelInputProcessor, EncodedLabelOutputProcessor): - """ - Adds a suffix or prefix (or both) to a label. 
- - Params: - - PREFIX: option prefix to the label - - SUFFIX: option suffix to the label - """ - - class Params(SingleColumnProcessor.Params): - prefix: constr(min_length=0) = "" - suffix: constr(min_length=0) = "" - - # def _transform_series(self, data: ScalableSeries) -> ScalableSeries: - # nulls: ScalableSeries = data.isna() - # data = self.params.prefix + data.fillna('').astype(str) + self.params.suffix - # data[nulls] = None - # return data - - def transform_single(self, data: Optional[Any]) -> Optional[str]: - if is_null(data): - return None - return self.params.prefix + str(data) + self.params.suffix diff --git a/src/fmcore/data/processor/categorical/LabelEncoding.py b/src/fmcore/data/processor/categorical/LabelEncoding.py deleted file mode 100644 index 67b3bd5..0000000 --- a/src/fmcore/data/processor/categorical/LabelEncoding.py +++ /dev/null @@ -1,206 +0,0 @@ -from typing import * - -import numpy as np -from pydantic import root_validator - -from fmcore.data.processor import ( - EncodedLabelOutputProcessor, - SingleColumnProcessor, - TextOrLabelInputProcessor, -) -from fmcore.data.sdf import ScalableSeries, ScalableSeriesRawType -from fmcore.util import AutoEnum, auto, is_null, type_str - - -class EncodingRange(AutoEnum): - ONE_TO_N = auto() - ZERO_TO_N_MINUS_ONE = auto() - BINARY_ZERO_ONE = auto() - BINARY_PLUS_MINUS_ONE = auto() - - -ENCODING_RANGE_TO_UNKNOWN_LABELS_MAP = { - EncodingRange.ONE_TO_N: 0, - EncodingRange.BINARY_ZERO_ONE: -1, - EncodingRange.BINARY_PLUS_MINUS_ONE: 0, - EncodingRange.ZERO_TO_N_MINUS_ONE: -1, -} - -BINARY_POSITIVE_LABELS: Set[str] = {"1", "Y", "YES", "TRUE", "T"} -BINARY_NEGATIVE_LABELS: Set[str] = {"0", "-1", "N", "NO", "FALSE", "F"} - -LabelEncoding = "LabelEncoding" - - -class LabelEncoding(SingleColumnProcessor, TextOrLabelInputProcessor, EncodedLabelOutputProcessor): - """ - Fits a list of categorical or integer values and transforms each into an integer value. - Params: - - ENCODING_RANGE: the output range of integer values, must be long to enum EncodingRange. Values: - - ONE_TO_N: encodes to 1, 2, 3, ... N (number of unique labels) - - ZERO_TO_N_MINUS_ONE: encodes to 0, 1, 2, ... N-1 (number of unique labels) - - BINARY_ZERO_ONE: encodes to 0 or 1. Throws an exception if the labels are not binary. - - BINARY_PLUS_MINUS_ONE: encodes to -1 or +1. Throws an exception if the labels are not binary. - - MISSING_INPUT_FILL_VALUE: the value to fill for None/NaN labels. - - UNKNOWN_INPUT_ENCODING_VALUE: the encoding value to fill for labels which are present in the data passed to the - transform step but not present in the data used to fit the transformer. 
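A small end-to-end example of the binary path (a hedged sketch; the factory and normalizer behaviour follow `from_labelspace` as defined further down in this class):

from fmcore.data.processor.categorical.LabelEncoding import EncodingRange, LabelEncoding

encoder = LabelEncoding.from_labelspace(
    labelspace={"yes", "no"},
    label_encoding_range=EncodingRange.ONE_TO_N,  # two labels are special-cased to BINARY_ZERO_ONE
    label_normalizer=lambda lb: str(lb).lower(),
)
# Known labels map to 0/1; the decoding dict inverts the mapping:
assert encoder.transform_single("yes") == 1
assert encoder.transform_single("no") == 0
assert encoder.inverse_transform_single(1) == "yes"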
- """ - - aliases = ["LabelEncoder"] - - class Params(SingleColumnProcessor.Params): - encoding_range: EncodingRange = EncodingRange.ONE_TO_N - missing_input_fill_value: Optional[Any] = None - unknown_input_encoding_value: Optional[Any] = None - label_normalizer: Optional[Callable[[Any], str]] = None - - @root_validator(pre=False) - def set_unknown_input_encoding_value(cls, params): - if params.get("unknown_input_encoding_value") is None: - params["unknown_input_encoding_value"]: Any = ENCODING_RANGE_TO_UNKNOWN_LABELS_MAP[ - params["encoding_range"] - ] - return params - - label_encoding_dict: Dict[Any, int] = None ## Stores normalized labels if label_normalizer is not None - label_decoding_dict: Dict[int, Any] = None ## Stores normalized labels if label_normalizer is not None - - @classmethod - def from_labelspace( - cls, - labelspace: Union[Set, List, Tuple], - label_encoding_range: EncodingRange, - label_normalizer: Callable[[Any], str], - ) -> LabelEncoding: - """ - Static factory to create a LabelEncoding object from a list/set/tuple of labels. - :param labelspace: complete set of labels for an ML training dataset. - :param label_encoding_range: the range of values to which we should encode the labels. - :param label_normalizer: function to normalize labels. - :return: LabelEncoding object. - """ - if len(labelspace) == 2: - lb1, lb2 = tuple(labelspace) ## Assume normalized beforehand. - lb1: str = label_normalizer(lb1) - lb2: str = label_normalizer(lb2) - if lb1.upper() in BINARY_NEGATIVE_LABELS and lb2.upper() in BINARY_POSITIVE_LABELS: - return LabelEncoding( - label_encoding_dict={lb1: 0, lb2: 1}, - label_decoding_dict={0: lb1, 1: lb2}, - params=dict( - encoding_range=EncodingRange.BINARY_ZERO_ONE, - label_normalizer=label_normalizer, - ), - ) - elif lb1.upper() in BINARY_POSITIVE_LABELS and lb2.upper() in BINARY_NEGATIVE_LABELS: - return LabelEncoding( - label_encoding_dict={lb2: 0, lb1: 1}, - label_decoding_dict={0: lb2, 1: lb1}, - params=dict( - encoding_range=EncodingRange.BINARY_ZERO_ONE, - label_normalizer=label_normalizer, - ), - ) - label_encoding_range: EncodingRange = EncodingRange.BINARY_ZERO_ONE - label_encoder: LabelEncoding = LabelEncoding( - params=dict( - encoding_range=label_encoding_range, - label_normalizer=label_normalizer, - ) - ) - label_encoder.fit(np.array(list(labelspace))) - return label_encoder - - def _fit_series(self, data: ScalableSeries): - ## Cannot use np.unique with NaNs in the data, as it replicates the nans: - labels: np.ndarray = self._fill_missing_values(data).dropna().numpy() - if self.params.missing_input_fill_value is not None: - labels: np.ndarray = np.append(labels, self.params.missing_input_fill_value) - labels: np.ndarray = np.unique(labels) ## Makes unique. - if self.params.label_normalizer is not None: - ## Normalize labels before encoding: - labels: np.ndarray = np.array([self.params.label_normalizer(lb) for lb in labels]) - labels: np.ndarray = np.unique(labels) ## Makes unique post-normalization. - ## The 2nd return param is an index of the unique labels, i.e. an encoding from 0 to N-1: - labels, encoded_labels = np.unique(labels, return_inverse=True) - num_labels, num_encodings = len(labels), len(encoded_labels) - if num_labels == 0: - raise ValueError("Input data must contain at least one non-null entry.") - if num_labels != num_encodings: - raise ValueError( - "Each label should have exactly one encoding. " - + f"Found: no. unique labels={num_labels}, no. 
encodings={num_encodings}" - ) - ## Adjust label encoding based on encoding range: - if self.params.encoding_range is EncodingRange.ZERO_TO_N_MINUS_ONE: - self.label_encoding_dict: Dict[Any, int] = dict(zip(labels, encoded_labels)) - elif self.params.encoding_range is EncodingRange.ONE_TO_N: - ## encoded_labels goes from 0 to N-1 - self.label_encoding_dict: Dict[Any, int] = dict(zip(labels, encoded_labels + 1)) - elif self.params.encoding_range is EncodingRange.BINARY_ZERO_ONE: - if num_labels > 2: - raise ValueError( - f"{EncodingRange.BINARY_ZERO_ONE} encoding supports <=2 labels, found {num_labels}" - ) - self.label_encoding_dict: Dict[Any, int] = {labels[0]: 0} - if num_labels == 2: - self.label_encoding_dict[labels[1]] = 1 - elif self.params.encoding_range is EncodingRange.BINARY_PLUS_MINUS_ONE: - if num_labels > 2: - raise ValueError( - f"{EncodingRange.BINARY_PLUS_MINUS_ONE} needs <=2 labels, found {num_labels}" - ) - self.label_encoding_dict: Dict[Any, int] = {labels[0]: -1} - if num_labels == 2: - self.label_encoding_dict[labels[1]] = 1 - else: - raise NotImplementedError(f"Unsupported encoding range: {self.params.encoding_range}") - self.label_decoding_dict: Dict[int, Any] = {v: k for k, v in self.label_encoding_dict.items()} - - def _transform_series(self, data: ScalableSeries) -> ScalableSeries: - if self.label_encoding_dict is None: - raise self.FitBeforeTransformError - data: ScalableSeries = self._fill_missing_values(data) - if self.params.label_normalizer is not None: - data: ScalableSeries = data.map(self.params.label_normalizer, na_action="ignore") - return data.map(self.label_encoding_dict, na_action="ignore").fillna( - self.params.unknown_input_encoding_value - ) - - def transform_single(self, data: Optional[Any]) -> int: - if self.label_encoding_dict is None: - raise self.FitBeforeTransformError - data = self._fill_missing_value(data) - return int(self.label_encoding_dict.get(data, self.params.unknown_input_encoding_value)) - - def inverse_transform_series( - self, - data: Union[ScalableSeries, ScalableSeriesRawType], - ) -> Union[ScalableSeries, ScalableSeriesRawType]: - if self.label_decoding_dict is None: - raise self.FitBeforeTransformError - output: ScalableSeries = ScalableSeries.of(data).map(self.label_decoding_dict, na_action="ignore") - if not isinstance(data, ScalableSeries): - output: ScalableSeriesRawType = output.raw() - return output - - def inverse_transform_single(self, data: int) -> Optional[str]: - if self.label_decoding_dict is None: - raise self.FitBeforeTransformError - if not isinstance(data, int): - raise ValueError( - f"Expected input data to be an integer; found {type_str(data)} having value: {data}" - ) - return self.label_decoding_dict.get(data) - - def _fill_missing_value(self, data: Any): - """TODO: replace this with a transformer or util which imputes missing values.""" - if is_null(data) and self.params.missing_input_fill_value is not None: - return self.params.missing_input_fill_value - return data - - def _fill_missing_values(self, data: ScalableSeries) -> ScalableSeries: - """TODO: replace this with a transformer or util which imputes missing values.""" - if self.params.missing_input_fill_value is not None: - return data.fillna(self.params.missing_input_fill_value) - return data diff --git a/src/fmcore/data/processor/categorical/__init__.py b/src/fmcore/data/processor/categorical/__init__.py deleted file mode 100644 index 226e510..0000000 --- a/src/fmcore/data/processor/categorical/__init__.py +++ /dev/null @@ -1,3 +0,0 @@ -from 
fmcore.data.processor.categorical.CategoricalMissingValueImputation import * -from fmcore.data.processor.categorical.LabelAffix import * -from fmcore.data.processor.categorical.LabelEncoding import * diff --git a/src/fmcore/data/processor/mixins.py b/src/fmcore/data/processor/mixins.py deleted file mode 100644 index b100fc8..0000000 --- a/src/fmcore/data/processor/mixins.py +++ /dev/null @@ -1,137 +0,0 @@ -from abc import ABC -from typing import * - -from fmcore.constants import MissingColumnBehavior, MLType -from fmcore.data.processor import DataProcessor - - -class NumericInputProcessor(DataProcessor, ABC): - """Mixin for numeric input data processors.""" - - input_mltypes = [MLType.INT, MLType.FLOAT] - - -class CategoricalInputProcessor(DataProcessor, ABC): - """Mixin for categorical input data processors.""" - - input_mltypes = [MLType.INT, MLType.CATEGORICAL] - - -class CategoricalOutputProcessor(DataProcessor, ABC): - """Mixin for categorical output data processors.""" - - output_mltype = MLType.CATEGORICAL - - -class IntegerOutputProcessor(DataProcessor, ABC): - """Mixin for integer output data processors.""" - - output_mltype = MLType.INT - - -class DecimalOutputProcessor(DataProcessor, ABC): - """Mixin for decimal output data processors.""" - - output_mltype = MLType.FLOAT - - -class EncodedLabelOutputProcessor(DataProcessor, ABC): - """Mixin for label output data processors.""" - - output_mltype = MLType.ENCODED_LABEL - - -class TextInputProcessor(DataProcessor, ABC): - """Mixin for text input data processors.""" - - input_mltypes = [ - MLType.TEXT, - MLType.CATEGORICAL, - MLType.INT, - MLType.FLOAT, - MLType.BOOL, - ] - - -class VectorAssemblerInputProcessor(DataProcessor, ABC): - """Mixin for vectorAssembler input data processors.""" - - input_mltypes = [MLType.INT, MLType.FLOAT, MLType.VECTOR, MLType.SPARSE_VECTOR] - - -class LabelInputProcessor(DataProcessor, ABC): - """Mixin for label input data processors.""" - - missing_column_behavior = MissingColumnBehavior.SKIP - - input_mltypes = [ - MLType.GROUND_TRUTH_LABEL, - MLType.ENCODED_LABEL, - MLType.PREDICTED_LABEL, - MLType.ENCODED_PREDICTED_LABEL, - ] - - -class TextOrLabelInputProcessor(DataProcessor, ABC): - """Mixin for text or label input data processors.""" - - missing_column_behavior = MissingColumnBehavior.SKIP - input_mltypes = LabelInputProcessor.input_mltypes + TextInputProcessor.input_mltypes - - -class TextOutputProcessor(DataProcessor, ABC): - """Mixin for text output data processors.""" - - output_mltype = MLType.TEXT - - -class BoolOutputProcessor(DataProcessor, ABC): - """Mixin for bool output data processors.""" - - output_mltype = MLType.BOOL - - -class VectorInputProcessor(DataProcessor, ABC): - """Mixin for vector input data processors.""" - - input_mltypes = [MLType.VECTOR] - - -class VectorOutputProcessor(DataProcessor, ABC): - """Mixin for vector output data processors.""" - - output_mltype = MLType.VECTOR - - -class SparseVectorInputProcessor(DataProcessor, ABC): - """Mixin for sparse vector input data processors.""" - - input_mltypes = [MLType.SPARSE_VECTOR] - - -class SparseVectorOutputProcessor(DataProcessor, ABC): - """Mixin for sparse vector output data processors.""" - - output_mltype = MLType.SPARSE_VECTOR - - -class NonVectorInputProcessor(DataProcessor, ABC): - """Mixin for non-vector input data processors.""" - - input_mltypes = list(set(MLType).difference({MLType.VECTOR, MLType.SPARSE_VECTOR})) - - -class PredictionsInputProcessor(DataProcessor, ABC): - """Mixin for algorithm predictions 
data input data processors.""" - - input_mltypes = [ - MLType.INDEX, - MLType.GROUND_TRUTH_LABEL, - MLType.ENCODED_LABEL, - MLType.PROBABILITY_SCORE, - MLType.PROBABILITY_SCORE_COMMA_SEPERATED_OR_LIST, - MLType.PREDICTED_LABEL, - MLType.PREDICTED_LABEL_COMMA_SEPARATED_OR_LIST, - MLType.ENCODED_PREDICTED_LABEL, - MLType.PREDICTED_CORRECT, - ] diff --git a/src/fmcore/data/processor/numeric/NumericMissingValueImputation.py b/src/fmcore/data/processor/numeric/NumericMissingValueImputation.py deleted file mode 100644 index 000ac88..0000000 --- a/src/fmcore/data/processor/numeric/NumericMissingValueImputation.py +++ /dev/null @@ -1,75 +0,0 @@ -from typing import * - -import pandas as pd -from pydantic import root_validator - -from fmcore.constants import MLType -from fmcore.data.processor import SingleColumnProcessor -from fmcore.util import AutoEnum, auto, is_null - - -class NumericImputationStrategy(AutoEnum): - MEAN = auto() - MEDIAN = auto() - MODE = auto() - MIN = auto() - MAX = auto() - CONSTANT = auto() - - -class NumericMissingValueImputation(SingleColumnProcessor): - """ - This calculates or fills in the value to be filled in place of nan based on strategy passed as input. - Params: - - FILL_VALUE: the value to be filled in when it encounters a NaN (This must be only passed when CONSTANT is strategy) - - STRATEGY: this indicates what strategy must be used when NaN is encountered - - MEAN: The "average" you're used to, where you add up all the numbers and then divide by the number of numbers - - MEDIAN: The "median" is the "middle" value in the list of numbers - - MODE: The number which appears most often in a set of numbers - - MIN: The minimum value of the series - - MAX: The Maximum value of the series - - CONSTANT: This allows the user to pass in a fill value where that fill value will be imputed - """ - - input_mltypes = [MLType.INT, MLType.FLOAT] - output_mltype = MLType.FLOAT - IMPUTE_FN_MAP: ClassVar[Dict[NumericImputationStrategy, Callable]] = { - NumericImputationStrategy.MODE: lambda _data: _data.mode(dropna=True).compute().iloc[0], - NumericImputationStrategy.MEAN: lambda _data: _data.mean(skipna=True), - NumericImputationStrategy.MEDIAN: lambda _data: _data.median(skipna=True), - NumericImputationStrategy.MIN: lambda _data: _data.min(skipna=True), - NumericImputationStrategy.MAX: lambda _data: _data.max(skipna=True), - } - - class Params(SingleColumnProcessor.Params): - strategy: NumericImputationStrategy - fill_value: Optional[Any] = None - - imputed_value: Optional[Any] = None - - @root_validator(pre=False) - def set_imputed_value(cls, params: Dict): - if params["params"].strategy is NumericImputationStrategy.CONSTANT: - if params["params"].fill_value is None: - raise ValueError( - f"Cannot have empty `fill_value` when `strategy` is {NumericImputationStrategy.CONSTANT}" - ) - params["imputed_value"] = params["params"].fill_value - elif params["params"].fill_value is not None: - raise ValueError( - f"`fill_value` can only be passed when strategy={NumericImputationStrategy.CONSTANT}" - ) - return params - - def _fit_series(self, data: pd.Series): - if self.params.strategy is not NumericImputationStrategy.CONSTANT: - if self.imputed_value is not None: - raise self.AlreadyFitError - self.imputed_value: Any = self.IMPUTE_FN_MAP[self.params.strategy](data) - - def transform_single(self, data: Optional[Any]) -> Any: - if self.imputed_value is None and self.params.strategy is not NumericImputationStrategy.CONSTANT: - raise self.FitBeforeTransformError - if is_null(data): - data = 
self.imputed_value - return data diff --git a/src/fmcore/data/processor/numeric/__init__.py b/src/fmcore/data/processor/numeric/__init__.py deleted file mode 100644 index 89d60ed..0000000 --- a/src/fmcore/data/processor/numeric/__init__.py +++ /dev/null @@ -1 +0,0 @@ -from fmcore.data.processor.numeric.NumericMissingValueImputation import * diff --git a/src/fmcore/data/processor/text/CaseTransformation.py b/src/fmcore/data/processor/text/CaseTransformation.py deleted file mode 100644 index df4a5a0..0000000 --- a/src/fmcore/data/processor/text/CaseTransformation.py +++ /dev/null @@ -1,30 +0,0 @@ -from typing import * - -from fmcore.data.processor import SingleColumnProcessor, TextInputProcessor, TextOutputProcessor -from fmcore.util import AutoEnum, auto, is_null - - -class Case(AutoEnum): - UPPER = auto() - LOWER = auto() - - -class CaseTransformation(SingleColumnProcessor, TextInputProcessor, TextOutputProcessor): - """ - Transforms the text case to uppercase or lowercase. - - Params: - - CASE: must be the string 'upper' or 'lower'. - """ - - class Params(SingleColumnProcessor.Params): - case: Case = Case.LOWER - - def transform_single(self, data: Optional[str]) -> Optional[str]: - if is_null(data): - return None - if self.params.case is Case.LOWER: - return data.lower() - elif self.params.case is Case.UPPER: - return data.upper() - raise NotImplementedError(f"Unsupported case: {self.params.case}") diff --git a/src/fmcore/data/processor/text/HtmlTagRemoval.py b/src/fmcore/data/processor/text/HtmlTagRemoval.py deleted file mode 100644 index 285b1ba..0000000 --- a/src/fmcore/data/processor/text/HtmlTagRemoval.py +++ /dev/null @@ -1,19 +0,0 @@ -import re -from typing import * - -from fmcore.data.processor import SingleColumnProcessor, TextInputProcessor, TextOutputProcessor -from fmcore.util import is_null - - -class HtmlTagRemoval(SingleColumnProcessor, TextInputProcessor, TextOutputProcessor): - """ - Removes HTML tags from the text. Leaves the content between tags untouched. - An HTML tag is recognized as anything between a pair of crocodile brackets, e.g.
<p>
, < p>, < p >, < /p html >, etc. - """ - - HTML_REGEX: ClassVar = re.compile("<.*?>") - - def transform_single(self, data: Optional[str]) -> Optional[str]: - if is_null(data): - return None - return self.HTML_REGEX.sub("", data) diff --git a/src/fmcore/data/processor/text/PunctuationCleaner.py b/src/fmcore/data/processor/text/PunctuationCleaner.py deleted file mode 100644 index c2a9d7a..0000000 --- a/src/fmcore/data/processor/text/PunctuationCleaner.py +++ /dev/null @@ -1,23 +0,0 @@ -import string -from typing import * - -from pydantic import constr - -from fmcore.data.processor import SingleColumnProcessor, TextInputProcessor, TextOutputProcessor -from fmcore.util import String, is_null - - -class PunctuationCleaner(SingleColumnProcessor, TextInputProcessor, TextOutputProcessor): - """ - Replaces punctuations with spaces. - """ - - class Params(SingleColumnProcessor.Params): - replacement_char: constr(min_length=1) = String.SPACE - - def transform_single(self, data: Optional[str]) -> Optional[str]: - if is_null(data): - return None - return data.translate( - str.maketrans(string.punctuation, self.params.replacement_char * len(string.punctuation)) - ) diff --git a/src/fmcore/data/processor/text/RegexSubstitution.py b/src/fmcore/data/processor/text/RegexSubstitution.py deleted file mode 100644 index eb61d8c..0000000 --- a/src/fmcore/data/processor/text/RegexSubstitution.py +++ /dev/null @@ -1,53 +0,0 @@ -import re -from typing import * - -from pydantic import constr, root_validator - -from fmcore.data.processor import SingleColumnProcessor, TextInputProcessor, TextOutputProcessor -from fmcore.util import is_null - - -class RegexSubstitution(SingleColumnProcessor, TextInputProcessor, TextOutputProcessor): - """ - Replaces each matched regex pattern in a list with the corresponding substitution pattern. - - Params: - - SUBSTITUTION_LIST: a list of 2-tuples, where the first element is the regex to match and the second is the - substitution (which might be string or regex, controllable via SUBSTITUTE_IS_REGEX). - This list of substitutions will be applied on the input text sequentially. - - IGNORECASE: whether to ignore case during regex matching. - - MULTILINE: whether to do multiline mathcing during regex matching. - - SUBSTITUTE_IS_REGEX: whether the substitution is a regex expression. If set to True, the transformer will compile - the substitution as regex during replacement, allowing usage of capturing groups etc. 
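For example, masking digits might look like this (a usage sketch against the deleted module shown here; the pattern and the placeholder string are arbitrary):

from fmcore.data.processor.text.RegexSubstitution import RegexSubstitution

masker = RegexSubstitution(
    params=dict(substitution_list=[(r"\d+", "[NUM]")]),  # substitutions are applied sequentially
)
# Each pattern is compiled once into `match_patterns` and applied in order:
assert masker.transform_single("order 66 of 99") == "order [NUM] of [NUM]"
assert masker.transform_single(None) is None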
- """ - - class Params(SingleColumnProcessor.Params): - substitution_list: List[Tuple[constr(min_length=1), constr(min_length=0)]] - ignorecase: bool = False - multiline: bool = True - substitute_is_regex: bool = True - flags: Optional[int] = None - match_patterns: Dict[constr(min_length=1), Any] = None - - @root_validator(pre=False) - def set_flags(cls, params): - flags = 0 - if params["ignorecase"]: - flags |= re.IGNORECASE - if params["multiline"]: - flags |= re.MULTILINE - params["flags"] = flags - params["match_patterns"] = { - regex_pattern: re.compile(regex_pattern, flags=flags) - for regex_pattern, _ in params["substitution_list"] - } - return params - - def transform_single(self, data: Optional[str]) -> Optional[str]: - if is_null(data): - return None - for regex_pattern, sub_str in self.params.substitution_list: - match_pattern = self.params.match_patterns[regex_pattern] - sub_pattern = sub_str if not self.params.substitute_is_regex else r"%s" % (sub_str) - data: str = match_pattern.sub(sub_pattern, data) - return data diff --git a/src/fmcore/data/processor/text/StringRemoval.py b/src/fmcore/data/processor/text/StringRemoval.py deleted file mode 100644 index 0b1b674..0000000 --- a/src/fmcore/data/processor/text/StringRemoval.py +++ /dev/null @@ -1,31 +0,0 @@ -from typing import * - -from pydantic import validator - -from fmcore.data.processor import SingleColumnProcessor, TextInputProcessor, TextOutputProcessor -from fmcore.util import is_list_like, is_null - - -class StringRemoval(SingleColumnProcessor, TextInputProcessor, TextOutputProcessor): - """ - Removes certain strings from each text string using str.replace() (no regex matching). - - Params: - - REMOVAL_LIST: the list of strings to remove. - """ - - class Params(SingleColumnProcessor.Params): - removal_list: List[str] - - @validator("removal_list") - def check_removal_list(cls, removal_list: List): - if len(removal_list) == 0 or not is_list_like(removal_list): - raise ValueError("`removal_list` should be a non-empty list of strings") - return list(removal_list) - - def transform_single(self, data: Optional[str]) -> Optional[str]: - if is_null(data): - return None - for s in self.params.removal_list: - data: str = data.replace(s, "") - return data diff --git a/src/fmcore/data/processor/text/TFIDFVectorization.py b/src/fmcore/data/processor/text/TFIDFVectorization.py deleted file mode 100644 index a73d344..0000000 --- a/src/fmcore/data/processor/text/TFIDFVectorization.py +++ /dev/null @@ -1,66 +0,0 @@ -from ast import literal_eval -from typing import * - -import numpy as np -from pydantic import root_validator, validator -from scipy.sparse import csr_matrix as SparseCSRMatrix -from sklearn.feature_extraction.text import TfidfVectorizer - -from fmcore.constants import MLType -from fmcore.data.processor import SingleColumnProcessor, TextInputProcessor -from fmcore.data.processor.vector.VectorDensifier import VectorDensifier -from fmcore.data.sdf import ScalableSeries -from fmcore.util import if_else - - -class TFIDFVectorization(SingleColumnProcessor, TextInputProcessor): - """ - Performs TF-IDF Vectorization of a text column using sklearn's TFIDFVectorizer. - Ref: https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.TfidfVectorizer.html - Params: - - OUTPUT_SPARSE: whether to output each row as a sparse row matrix (1 x N). If False, will output a 1d numpy array. 
- - SKLEARN_PARAMS: dictionary of sklearn params to be unpacked as keyword arguments to the constructor - sklearn.feature_extraction.text.TfidfVectorizer. Thus, keys are case-sensitive. - """ - - class Params(SingleColumnProcessor.Params): - sklearn_params: Dict = {} - output_sparse: bool = False - - @validator("sklearn_params", pre=True) - def process_sklearn_tfidf_params(cls, sklearn_tfidf_params: Dict): - token_pattern: Optional = sklearn_tfidf_params.get("token_pattern") - if token_pattern is not None: - sklearn_tfidf_params["token_pattern"] = str(sklearn_tfidf_params.get("token_pattern")) - ngram_range: Optional = sklearn_tfidf_params.get("ngram_range") - if ngram_range is not None: - if isinstance(ngram_range, str): - ngram_range = literal_eval(ngram_range) - if isinstance(ngram_range, list): - ngram_range = tuple(ngram_range) - assert isinstance(ngram_range, tuple) - sklearn_tfidf_params["ngram_range"] = ngram_range - return sklearn_tfidf_params - - output_mltype = MLType.VECTOR - vectorizer: TfidfVectorizer = None - vector_densifier: VectorDensifier = None - - @root_validator(pre=False) - def set_vectorizer(cls, params: Dict): - params["vectorizer"]: TfidfVectorizer = TfidfVectorizer(**params["params"].sklearn_params) - params["vector_densifier"]: VectorDensifier = VectorDensifier() - params["output_mltype"]: MLType = if_else( - params["params"].output_sparse, MLType.SPARSE_VECTOR, MLType.VECTOR - ) - return params - - def _fit_series(self, data: ScalableSeries): - self.vectorizer.fit(data.pandas()) ## TODO: Super slow, replace with Dask TFIDF - - def transform_single(self, data: str) -> Union[SparseCSRMatrix, np.ndarray]: - ## Will output a sparse matrix with only a single row. - tfidf_vec: SparseCSRMatrix = self.vectorizer.transform([data]) - if not self.params.output_sparse: - tfidf_vec: np.ndarray = self.vector_densifier.transform_single(tfidf_vec) - return tfidf_vec diff --git a/src/fmcore/data/processor/text/TextConcatenation.py b/src/fmcore/data/processor/text/TextConcatenation.py deleted file mode 100755 index cc39b1b..0000000 --- a/src/fmcore/data/processor/text/TextConcatenation.py +++ /dev/null @@ -1,90 +0,0 @@ -from typing import * - -from pydantic import constr, root_validator - -from fmcore.data.processor import Nto1ColumnProcessor, TextInputProcessor, TextOutputProcessor -from fmcore.data.sdf import ScalableDataFrame, ScalableSeries -from fmcore.util import AutoEnum, String, auto, is_list_like, is_null - - -class ColumnOrder(AutoEnum): - SORT_BY_NAME_ASCENDING = auto() - SORT_BY_NAME_DESCENDING = auto() - SORT_BY_SHORTEST_FIRST = auto() - INPUT_ORDER = auto() - - -class TextConcatenation(Nto1ColumnProcessor, TextInputProcessor, TextOutputProcessor): - """ - Concatenates text from multiple columns into a single column. - For non-text columns, converts to string and then concatenates. - - Params: - - SEP: the separator between columns in the combined text string. - - COLUMN_ORDER: which way to order columns. - """ - - class Params(Nto1ColumnProcessor.Params): - sep: constr(min_length=1) = String.SPACE - column_order: ColumnOrder = ( - ColumnOrder.SORT_BY_NAME_ASCENDING - ) ## Do not change this for legacy reasons. 
- input_ordering: Optional[List[str]] = None - prefix_col_name: Optional[bool] = False - prefix_col_sep: Optional[str] = ": " - allow_missing: Optional[bool] = False - - ordered_cols: Optional[List[str]] = None - - @root_validator(pre=False) - def set_ordered_cols(cls, params: Dict): - if params["params"].column_order is ColumnOrder.INPUT_ORDER: - if not is_list_like(params["params"].input_ordering): - raise ValueError( - f"`input_ordering` must be a non-empty list when column_order={ColumnOrder.INPUT_ORDER}" - ) - params["ordered_cols"]: List[str] = params["params"].input_ordering - return params - - def _fit_df(self, data: ScalableDataFrame): - cols: List[str] = list(data.columns) - if self.params.column_order is ColumnOrder.SORT_BY_SHORTEST_FIRST: - avg_column_length: Dict[str, float] = { - col: data[col].dropna().astype(str).apply(len).mean() for col in cols - } - ## Sort first by avg. length, then by column name: - self.ordered_cols: List[str] = [ - col for col, avg_len in sorted(list(avg_column_length.items()), key=lambda x: (x[1], x[0])) - ] - elif self.params.column_order is ColumnOrder.SORT_BY_NAME_DESCENDING: - self.ordered_cols: List[str] = sorted(cols, reverse=True) - elif self.params.column_order is ColumnOrder.SORT_BY_NAME_ASCENDING: - self.ordered_cols: List[str] = sorted(cols) - elif self.params.column_order is ColumnOrder.INPUT_ORDER: - self.ordered_cols: List[str] = self.params.input_ordering - else: - self.ordered_cols = None - - def _transform_single(self, data: List[Any]) -> str: - """Concatanate a list of data of any type""" - return self.params.sep.join([str(x) for x in data if not is_null(x)]) - - def _transform_df(self, data: ScalableDataFrame) -> ScalableSeries: - if self.ordered_cols is None: - raise self.FitBeforeTransformError - output_series: Optional[ScalableSeries] = None - for col in self.ordered_cols: - if col not in data.columns_set: - if self.params.allow_missing: - continue - raise ValueError( - f"Column {col} is required but not found in input data. 
Input data has columns: {data.columns}" - ) - to_add_col = col + self.params.prefix_col_sep - if self.params.prefix_col_name is False: - to_add_col = "" - if output_series is None: - output_series: ScalableSeries = to_add_col + data[col].fillna(String.EMPTY).astype(str) - else: - output_series += self.params.sep + to_add_col + data[col].fillna(String.EMPTY).astype(str) - return output_series diff --git a/src/fmcore/data/processor/text/__init__.py b/src/fmcore/data/processor/text/__init__.py deleted file mode 100644 index 955bbc9..0000000 --- a/src/fmcore/data/processor/text/__init__.py +++ /dev/null @@ -1,7 +0,0 @@ -from fmcore.data.processor.text.CaseTransformation import * -from fmcore.data.processor.text.HtmlTagRemoval import * -from fmcore.data.processor.text.PunctuationCleaner import * -from fmcore.data.processor.text.RegexSubstitution import * -from fmcore.data.processor.text.StringRemoval import * -from fmcore.data.processor.text.TextConcatenation import * -from fmcore.data.processor.text.TFIDFVectorization import * diff --git a/src/fmcore/data/processor/vector/VectorAssembler.py b/src/fmcore/data/processor/vector/VectorAssembler.py deleted file mode 100644 index 23b0701..0000000 --- a/src/fmcore/data/processor/vector/VectorAssembler.py +++ /dev/null @@ -1,63 +0,0 @@ -from typing import * - -import numpy as np -from scipy.sparse import csr_matrix as SparseCSRMatrix - -from fmcore.constants import MLType -from fmcore.data.processor import Nto1ColumnProcessor, VectorAssemblerInputProcessor, VectorOutputProcessor -from fmcore.data.sdf import ScalableDataFrame, ScalableSeries -from fmcore.util import AutoEnum, as_list, auto, is_null - - -class InvalidBehavior(AutoEnum): - ERROR = auto() - KEEP = auto() - - -class VectorAssembler(Nto1ColumnProcessor, VectorAssemblerInputProcessor, VectorOutputProcessor): - """ - Concatenates multiple columns into a single vector column - - Params: - - HANDLE_INVALID: how to handle NaN values in columns - - ERROR: Throws an error if invalid data/NaN value is present - - KEEP: Keeps all the rows, ignores NaNs and Nones. 
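In use it might look like the following (a sketch with several assumptions: that `data_schema` is a plain dict of column name to MLType, as the lookup in `_transform_series` suggests, and that a raw pandas DataFrame is accepted as ScalableOrRaw input):

import pandas as pd

from fmcore.constants import MLType
from fmcore.data.processor.vector.VectorAssembler import VectorAssembler

assembler = VectorAssembler(
    data_schema={"score": MLType.FLOAT, "embedding": MLType.VECTOR},
)
df = pd.DataFrame({"score": [0.5], "embedding": [[1.0, 2.0]]})
# Columns are assembled in sorted order ("embedding", then "score") into one dense list per row:
assembled = assembler.transform(df)
print(assembled.tolist())  # expected: [[1.0, 2.0, 0.5]]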
- """ - - class Params(Nto1ColumnProcessor.Params): - handle_invalid: InvalidBehavior = InvalidBehavior.KEEP - - def _transform_df(self, data: ScalableDataFrame) -> ScalableSeries: - output_series: Optional[ScalableSeries] = None - for col in sorted(list(data.columns)): - if output_series is None: - output_series: ScalableSeries = self._transform_series(data[col], col) - else: - output_series += self._transform_series(data[col], col) - return output_series - - def _transform_series(self, data: ScalableSeries, col: str) -> ScalableSeries: - feature_type: MLType = self.data_schema[col] - if feature_type in {MLType.INT, MLType.FLOAT, MLType.VECTOR}: - return data.apply(self._convert_to_list, col=col) - elif feature_type is MLType.SPARSE_VECTOR: - return data.apply(self._convert_sparse_vector_to_dense_vector, col=col) - else: - raise TypeError(f"{col} Column must be of type {self.input_mltypes}; found {feature_type}") - - def _convert_sparse_vector_to_dense_vector(self, vector: SparseCSRMatrix, col: str): - if isinstance(vector, SparseCSRMatrix): - dense_vector: np.ndarray = vector.toarray()[0] - else: - if self.params.handle_invalid is InvalidBehavior.ERROR: - raise ValueError( - f'Expected only SparseCSRMatrix in column "{col}", got a value of type {type(vector)}' - ) - dense_vector: Optional[np.ndarray] = None - return self._convert_to_list(dense_vector, col) - - def _convert_to_list(self, val: Optional[Union[np.ndarray, List, Set, Tuple, Any]], col: str): - ## Assumes the length of vectors are same throughout the column. - if is_null(val) and self.params.handle_invalid is InvalidBehavior.ERROR: - raise ValueError(f'Got empty value ({val}) in column: "{col}"') - return as_list(val) diff --git a/src/fmcore/data/processor/vector/VectorDensifier.py b/src/fmcore/data/processor/vector/VectorDensifier.py deleted file mode 100644 index eba2983..0000000 --- a/src/fmcore/data/processor/vector/VectorDensifier.py +++ /dev/null @@ -1,33 +0,0 @@ -from typing import * - -import numpy as np -from scipy.sparse import csr_matrix as SparseCSRMatrix - -from fmcore.data.processor import SingleColumnProcessor, SparseVectorInputProcessor, VectorOutputProcessor -from fmcore.util import is_null - - -class VectorDensifier(SingleColumnProcessor, SparseVectorInputProcessor, VectorOutputProcessor): - """Converts a sparse vector column into a dense vector column. Each dense vector is a 1d numpy array.""" - - class Params(SingleColumnProcessor.Params): - output_list: bool = False - - def transform_single(self, data: SparseCSRMatrix) -> Optional[np.ndarray]: - if is_null(data): - return None - if not isinstance(data, SparseCSRMatrix): - raise ValueError(f"{str(self.__class__)} can only densify SparseCSRMatrix objects") - data: np.ndarray = data.toarray() - if len(data.shape) != 2: - raise ValueError( - f"Each SparseCSRMatrix to densify must have two dimensions. Found: {len(data.shape)} dims" - ) - if data.shape[0] != 1: - raise ValueError( - f"Each SparseCSRMatrix to densify must have exactly 1 row. 
Found: {data.shape[0]} rows" - ) - data: np.ndarray = data[0] - if self.params.output_list: - data: List = list(data) - return data diff --git a/src/fmcore/data/processor/vector/__init__.py b/src/fmcore/data/processor/vector/__init__.py deleted file mode 100644 index 0719973..0000000 --- a/src/fmcore/data/processor/vector/__init__.py +++ /dev/null @@ -1,2 +0,0 @@ -from fmcore.data.processor.vector.VectorAssembler import * -from fmcore.data.processor.vector.VectorDensifier import * diff --git a/src/fmcore/data/reader.py b/src/fmcore/data/reader.py new file mode 100644 index 0000000..ab458de --- /dev/null +++ b/src/fmcore/data/reader.py @@ -0,0 +1 @@ +## Custom Readers diff --git a/src/fmcore/data/writer.py b/src/fmcore/data/writer.py new file mode 100644 index 0000000..8a8e5b0 --- /dev/null +++ b/src/fmcore/data/writer.py @@ -0,0 +1,81 @@ +## Custom Writers +import io +from typing import Callable, Dict, List, NoReturn, Optional, Union + +import pandas as pd +from bears.core.frame.DaskScalableDataFrame import DaskScalableDataFrame +from bears.core.frame.ScalableDataFrame import ScalableDataFrame +from bears.writer.dataframe.DataFrameWriter import DataFrameWriter +from pydantic import ConfigDict, model_validator + +from fmcore.constants import FileContents, FileFormat, Storage, Task +from fmcore.framework._metric import Metric +from fmcore.framework._predictions import Predictions + + +class MetricsWriter(DataFrameWriter): + aliases = ["MetricDataFrameWriter"] ## Backward compatibility + file_formats = [FileFormat.METRICS_JSONLINES] + dask_multiple_write_file_suffix = ".part" ## github.com/dask/dask/issues/9044 + file_contents = [FileContents.METRICS_DATAFRAME] + + class Params(DataFrameWriter.Params): + task: Task + metrics: List[Metric] = [] + + @model_validator(mode="before") + @classmethod + def convert_params(cls, params: Dict): + metric_list: List[Metric] = [ + Metric.of( + metric_dict.get("metric_name"), + **metric_dict.get("metric_params", {}), + ) + for metric_dict in params.get("metrics_list") + ] + params["metrics"] = metric_list + return params + + model_config = ConfigDict( + extra="ignore", + ) + + @model_validator(mode="before") + @classmethod + def convert_params(cls, params: Dict): + params["params"] = cls._convert_params(cls.Params, params) + return params + + def _write_sdf( + self, + destination: Union[io.IOBase, str], + sdf: ScalableDataFrame, + storage: Storage, + **kwargs, + ) -> NoReturn: + PredictionsClass: Predictions = Predictions.get_subclass(self.params.task) + predictions: Predictions = PredictionsClass.from_dataframe(data=sdf, data_schema=self.data_schema) + + evaluated_metrics: List[Metric] = [ + Metric.of(**metric).evaluate(predictions) for metric in self.params.metrics + ] + + metrics_df: pd.DataFrame = pd.DataFrame( + { + evaluated_metric.display_name: [evaluated_metric.aiw_format] + for evaluated_metric in evaluated_metrics + } + ) + metrics_df = metrics_df[sorted(metrics_df.columns)] + metrics_df.to_json(path_or_buf=destination, orient="records") + + def _write_dask_sdf( + self, + destination: Union[io.IOBase, str], + sdf: DaskScalableDataFrame, + storage: Storage, + is_dir: bool, + name_function: Optional[Callable[[int], str]] = None, + **kwargs, + ) -> NoReturn: + self._write_sdf(destination=destination, sdf=sdf, storage=storage, **kwargs) diff --git a/src/fmcore/framework/__init__.py b/src/fmcore/framework/__init__.py index dbee306..1b3a46f 100644 --- a/src/fmcore/framework/__init__.py +++ b/src/fmcore/framework/__init__.py @@ -1,10 +1,12 @@ -from 
fmcore.framework.task_data import * -from fmcore.framework.predictions import * -from fmcore.framework.algorithm import * -from fmcore.framework.metric import * -from fmcore.framework.tracker import * -from fmcore.framework.visualize import * -from fmcore.framework.trainer import * -from fmcore.framework.evaluator import * -from fmcore.framework.task import * -from fmcore.framework.chain import * +from fmcore.framework.dl import * +from fmcore.framework._metric import * +from fmcore.framework._task_mixins import * +from fmcore.framework._dataset import * +from fmcore.framework._predictions import * +from fmcore.framework._algorithm import * +from fmcore.framework._tracker import * +from fmcore.framework._visualize import * +from fmcore.framework._trainer import * +from fmcore.framework._evaluator import * +from fmcore.framework._chain import * +from fmcore.framework._task import * diff --git a/src/fmcore/framework/algorithm.py b/src/fmcore/framework/_algorithm.py similarity index 96% rename from src/fmcore/framework/algorithm.py rename to src/fmcore/framework/_algorithm.py index 577ff3e..7bec3a2 100644 --- a/src/fmcore/framework/algorithm.py +++ b/src/fmcore/framework/_algorithm.py @@ -2,20 +2,26 @@ import pickle import tempfile from abc import ABC, abstractmethod -from typing import * +from typing import ( + Any, + Callable, + ClassVar, + Dict, + Generator, + Iterator, + List, + Optional, + Set, + Tuple, + Type, + Union, +) import numpy as np -from pydantic import Extra, conint, root_validator - -from fmcore.constants import DataSplit, MLType, MLTypeSchema -from fmcore.data import FileMetadata, ScalableDataFrame -from fmcore.framework import Dataset, Datasets, Predictions -from fmcore.framework.metric import Metric, Metrics -from fmcore.framework.mixins import TaskOrStr, TaskRegistryMixin -from fmcore.util import ( +from bears import FileMetadata, ScalableDataFrame +from bears.util import ( FractionalBool, Log, - MutableParameters, Parameters, Registry, Schema, @@ -35,6 +41,13 @@ resolve_fractional_bool, safe_validate_arguments, ) +from pydantic import ConfigDict, conint, model_validator + +from fmcore.constants import DataSplit, MLType, MLTypeSchema +from ._metric import Metric, Metrics +from ._task_mixins import TaskOrStr, TaskRegistryMixin +from ._predictions import Predictions +from ._dataset import Dataset, Datasets MODEL_PARAMS_FILE_NAME: str = "__model_params__.pkl" @@ -60,8 +73,10 @@ class Algorithm(TaskRegistryMixin, Registry, ABC): default_batching_params: ClassVar[Dict[str, Any]] = {} - class Config(MutableParameters.Config): - extra = Extra.allow ## Mutable+Extra = allows dynamically adding new items. + model_config = ConfigDict( + ## Mutable+Extra = allows dynamically adding new items. + extra="allow", + ) @classmethod def _pre_registration_hook(cls): @@ -115,10 +130,12 @@ class Hyperparameters(Parameters): epochs: Optional[conint(ge=1)] = None ## Number of epochs to train. None allows inference-only models steps: Optional[conint(ge=1)] = None ## Number of steps to train. 
None allows inference-only models - class Config(Parameters.Config): - extra = Extra.allow + model_config = ConfigDict( + extra="allow", + ) - @root_validator(pre=True) + @model_validator(mode="before") + @classmethod def check_params(cls, params: Dict) -> Dict: if all_are_not_none(params.get("epochs"), params.get("steps")): raise ValueError("Must pass at most one of `epochs` and `steps`; both were passed.") @@ -151,8 +168,10 @@ def create_hyperparams(cls, hyperparams: Optional[Dict] = None) -> Hyperparamete hyperparams: Dict = get_default(hyperparams, {}) return cls.Hyperparameters(**hyperparams) - @root_validator(pre=False) - def convert_params(cls, params: Dict): + @model_validator(mode="before") + @classmethod + def convert_params(cls, params: Dict) -> Dict: + cls.set_default_param_values(params) ## This allows us to create a new Algorithm instance without specifying `hyperparams`. ## If it is specified, we will pick cls.Hyperparameters, which can be overridden by the subclass. params.setdefault("hyperparams", {}) @@ -368,7 +387,7 @@ def train( """ if datasets is None and "dataset" in kwargs: datasets = kwargs["dataset"] - from fmcore.framework.trainer.Trainer import Trainer + from fmcore.framework._trainer.Trainer import Trainer if not isinstance(datasets, Datasets): datasets: Datasets = Datasets.of(train=datasets) diff --git a/src/fmcore/framework/chain/Chain.py b/src/fmcore/framework/_chain/Chain.py similarity index 98% rename from src/fmcore/framework/chain/Chain.py rename to src/fmcore/framework/_chain/Chain.py index f79542f..74e9b6b 100644 --- a/src/fmcore/framework/chain/Chain.py +++ b/src/fmcore/framework/_chain/Chain.py @@ -7,21 +7,27 @@ from abc import ABC, abstractmethod from datetime import datetime from functools import partial -from typing import * +from typing import ( + Any, + Callable, + Dict, + List, + Literal, + NoReturn, + Optional, + Set, + Tuple, + Type, + Union, +) import numpy as np -from pydantic import Extra, confloat, conint, constr, root_validator -from pydantic.typing import Literal - -from fmcore.constants import COMPLETED_STATUSES, Parallelize, Status -from fmcore.framework.tracker import Tracker -from fmcore.util import ( +from bears.util import ( Alias, Executor, FunctionSpec, Future, MutableParameters, - Parameters, ProgressBar, Registry, String, @@ -42,7 +48,11 @@ stop_executor, type_str, ) -from fmcore.util.notify import Notifier +from bears.util.notify import Notifier +from pydantic import ConfigDict, confloat, conint, constr, model_validator + +from fmcore.constants import COMPLETED_STATUSES, Parallelize, Status +from fmcore.framework._tracker import Tracker Step = "Step" StepExecution = "StepExecution" @@ -59,8 +69,9 @@ class Step(MutableParameters, Registry, ABC): tracker: Optional[Tracker] = None verbosity: conint(ge=0) = 1 - class Config(Parameters.Config): - extra = Extra.ignore + model_config = ConfigDict( + extra="ignore", + ) @classmethod def _pre_registration_hook(cls): @@ -180,7 +191,8 @@ def of(cls, *steps, **kwargs) -> Chain: raise ValueError(f"You must pass at least one step when calling {cls.class_name}.of(...)") return cls(steps=steps, **kwargs) - @root_validator(pre=True) + @model_validator(mode="before") + @classmethod def set_chain_params(cls, params: Dict) -> Dict: set_param_from_alias(params, param="verbosity", alias=["verbose"]) params["steps"]: List[Step] = [Step.of(step) for step in params["steps"]] @@ -768,8 +780,10 @@ class ChainExecution(MutableParameters): _executor: Optional[Executor] = None error: Optional[Exception] = 
None - @root_validator(pre=False) + @model_validator(mode="before") + @classmethod def _chain_execution_params(cls, params: Dict) -> Dict: + cls.set_default_param_values(params) if params.get("uuid") is None: chain_template: Chain = params["chain_template"] uuid_dict: Dict = dict( diff --git a/src/fmcore/framework/_chain/__init__.py b/src/fmcore/framework/_chain/__init__.py new file mode 100644 index 0000000..9aa5c25 --- /dev/null +++ b/src/fmcore/framework/_chain/__init__.py @@ -0,0 +1 @@ +from fmcore.framework._chain.Chain import * \ No newline at end of file diff --git a/src/fmcore/framework/task_data.py b/src/fmcore/framework/_dataset.py similarity index 96% rename from src/fmcore/framework/task_data.py rename to src/fmcore/framework/_dataset.py index a630702..dc5a98a 100644 --- a/src/fmcore/framework/task_data.py +++ b/src/fmcore/framework/_dataset.py @@ -1,13 +1,10 @@ from abc import ABC -from typing import * +from typing import ClassVar, Dict, List, NoReturn, Optional, Set, TypeVar, Union -from pydantic import root_validator - -from fmcore.constants import FILE_FORMAT_TO_FILE_ENDING_MAP, DataSplit, FileFormat, MLTypeSchema -from fmcore.data import FileMetadata -from fmcore.data.sdf import ScalableDataFrame, ScalableOrRaw -from fmcore.framework.mixins import InputOutputDataMixin, SchemaValidationError, TaskOrStr -from fmcore.util import ( +from bears import FileMetadata +from bears.core.frame import ScalableDataFrame, ScalableOrRaw +from bears.reader import DataFrameReader, JsonReader +from bears.util import ( Parameters, Registry, Schema, @@ -21,6 +18,11 @@ run_concurrent, safe_validate_arguments, ) +from bears.writer import DataFrameWriter, JsonWriter +from pydantic import model_validator + +from fmcore.constants import FILE_FORMAT_TO_FILE_ENDING_MAP, DataSplit, FileFormat, MLTypeSchema +from fmcore.framework._task_mixins import InputOutputDataMixin, SchemaValidationError, TaskOrStr Dataset = "Dataset" Visualization = "Visualization" @@ -75,7 +77,8 @@ def concat( # from datasets import load_dataset # data = to_sdf(load_dataset(name, **kwargs)) - @root_validator(pre=True) + @model_validator(mode="before") + @classmethod def _set_dataset_params(cls, params: Dict) -> Dict: data_schema: Union[Schema, MLTypeSchema] = params["data_schema"] if isinstance(data_schema, dict): @@ -155,7 +158,7 @@ def visualize( name: Optional[str] = None, **kwargs, ) -> Visualization: - from fmcore.framework.visualize import Visualization + from fmcore.framework._visualize import Visualization ## Should show an interactive plot. 
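Stepping back from the individual hunks, the recurring edit in this file and the surrounding framework modules is the Pydantic v1 to v2 validator migration; in isolation the pattern looks roughly like this (a generic sketch, not fmcore code):

from pydantic import BaseModel, ConfigDict, model_validator


class Example(BaseModel):
    # v1: `class Config: extra = Extra.allow`  ->  v2: `model_config = ConfigDict(...)`
    model_config = ConfigDict(extra="allow")

    name: str = ""

    # v1: `@root_validator(pre=True)`  ->  v2: `@model_validator(mode="before")` plus @classmethod
    @model_validator(mode="before")
    @classmethod
    def convert_params(cls, params: dict) -> dict:
        params.setdefault("name", "default")
        return params


assert Example().name == "default"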
return Visualization.plot(data=self, name=name, **kwargs) @@ -173,7 +176,8 @@ def of(cls, **datasets) -> Datasets: except Exception as e: raise ValueError(f"Error creating {cls.class_name}:\n{String.format_exception_msg(e)}") - @root_validator(pre=True) + @model_validator(mode="before") + @classmethod def set_datasets(cls, params: Dict): name: Optional[str] = None if params.get("name") is not None: @@ -228,8 +232,6 @@ def read(self, **kwargs) -> Datasets: ) -TaskData = Dataset - DATASET_PARAMS_SAVE_FILE_NAME: str = "dataset-metadata" DATASET_PARAMS_SAVE_FILE_ENDING: str = ".dataset-params.json" @@ -244,8 +246,6 @@ def load_dataset( if dataset_source is None: return ## Don't want to mistake with similar params used for prediction: - from fmcore.data.reader import DataFrameReader, JsonReader - dataset_source: FileMetadata = FileMetadata.of(dataset_source) reader: DataFrameReader = DataFrameReader.of( dataset_source.format, @@ -348,8 +348,6 @@ def save_dataset( overwrite: bool = True, **kwargs, ) -> NoReturn: - from fmcore.data.writer import DataFrameWriter, JsonWriter - if any_are_none(dataset, dataset_destination): return dataset_destination: FileMetadata = FileMetadata.of(dataset_destination) diff --git a/src/fmcore/framework/evaluator/AccelerateEvaluator.py b/src/fmcore/framework/_evaluator/AccelerateEvaluator.py similarity index 97% rename from src/fmcore/framework/evaluator/AccelerateEvaluator.py rename to src/fmcore/framework/_evaluator/AccelerateEvaluator.py index 7880751..2d92002 100644 --- a/src/fmcore/framework/evaluator/AccelerateEvaluator.py +++ b/src/fmcore/framework/_evaluator/AccelerateEvaluator.py @@ -1,12 +1,12 @@ -from typing import * - -from pydantic import conint, root_validator +from typing import ( + Dict, + List, + Optional, + Union, +) -from fmcore.data import FileMetadata -from fmcore.framework.algorithm import Algorithm -from fmcore.framework.evaluator.LocalEvaluator import LocalEvaluator -from fmcore.framework.task import GenerativeLM, LanguageModelTaskMixin -from fmcore.util import ( +from bears import FileMetadata +from bears.util import ( EnvUtil, FileSystemUtil, as_list, @@ -15,6 +15,11 @@ set_param_from_alias, type_str, ) +from pydantic import conint, model_validator + +from fmcore.framework._algorithm import Algorithm +from fmcore.framework._evaluator.LocalEvaluator import LocalEvaluator +from fmcore.framework._task import GenerativeLM, LanguageModelTaskMixin with optional_dependency("accelerate", "torch", "transformers"): import torch @@ -32,7 +37,8 @@ class AccelerateEvaluator(LocalEvaluator): no_split_module_classes: Optional[List[str]] = None use_hf_from_pretrained: bool = False - @root_validator(pre=True) + @model_validator(mode="before") + @classmethod def set_accelerate_evaluator_params(cls, params: Dict) -> Dict: set_param_from_alias( params, param="model_weights_dtype", alias=["weights_dtype", "model_dtype", "torch_dtype"] diff --git a/src/fmcore/framework/evaluator/Evaluator.py b/src/fmcore/framework/_evaluator/Evaluator.py similarity index 96% rename from src/fmcore/framework/evaluator/Evaluator.py rename to src/fmcore/framework/_evaluator/Evaluator.py index 9f7df34..7162067 100644 --- a/src/fmcore/framework/evaluator/Evaluator.py +++ b/src/fmcore/framework/_evaluator/Evaluator.py @@ -3,18 +3,21 @@ import math from abc import ABC, abstractmethod from copy import deepcopy -from typing import * +from typing import ( + Any, + ClassVar, + Dict, + List, + Optional, + Set, + Tuple, + Type, + Union, +) import numpy as np -from pydantic import Extra, 
confloat, conint, root_validator - -from fmcore.constants import _LIBRARY_NAME, Storage, Task -from fmcore.data import FileMetadata -from fmcore.framework.algorithm import Algorithm, TaskOrStr -from fmcore.framework.metric import Metric -from fmcore.framework.predictions import Predictions, save_predictions -from fmcore.framework.tracker import DEFAULT_TRACKER_PARAMS, Tracker -from fmcore.util import ( +from bears import FileMetadata +from bears.util import ( Alias, FractionalBool, Log, @@ -35,7 +38,15 @@ start_daemon, stop_daemon, ) -from fmcore.util.aws import S3Util +from bears.util.aws import S3Util +from pydantic import ConfigDict, confloat, conint, model_validator + +from fmcore import _LIBRARY_NAME +from fmcore.constants import Storage, Task +from fmcore.framework._algorithm import Algorithm, TaskOrStr +from fmcore.framework._metric import Metric +from fmcore.framework._predictions import Predictions, save_predictions +from fmcore.framework._tracker import DEFAULT_TRACKER_PARAMS, Tracker Evaluator = "Evaluator" @@ -46,12 +57,14 @@ class Evaluator(MutableParameters, Registry, ABC): ) _allow_subclass_override: ClassVar[bool] = True ## Allows replacement of subclass with same name. - class Config(Parameters.Config): - extra = Extra.ignore + model_config = ConfigDict( + extra="ignore", + ) class RunConfig(Parameters): - class Config(Parameters.Config): - extra = Extra.allow + model_config = ConfigDict( + extra="allow", + ) task: Optional[TaskOrStr] = None AlgorithmClass: Optional[Union[Type[Algorithm], str]] = None @@ -75,7 +88,8 @@ class Config(Parameters.Config): ## Logging verbosity. 0 = zero logging, 1 = Basic logging, 2 = verbose logging, 3 = super verbose logging. verbosity: conint(ge=0) = 1 - @root_validator(pre=True) + @model_validator(mode="before") + @classmethod def evaluator_params(cls, params: Dict): Alias.set_AlgorithmClass(params) Alias.set_model_dir(params) diff --git a/src/fmcore/framework/evaluator/LocalEvaluator.py b/src/fmcore/framework/_evaluator/LocalEvaluator.py similarity index 91% rename from src/fmcore/framework/evaluator/LocalEvaluator.py rename to src/fmcore/framework/_evaluator/LocalEvaluator.py index e6201e9..8195437 100644 --- a/src/fmcore/framework/evaluator/LocalEvaluator.py +++ b/src/fmcore/framework/_evaluator/LocalEvaluator.py @@ -1,15 +1,24 @@ import gc import os from functools import partial -from typing import * +from typing import ( + Any, + Callable, + Dict, + List, + Optional, + Tuple, + Union, +) -from fmcore.data import FileMetadata -from fmcore.framework.algorithm import Algorithm -from fmcore.framework.evaluator.Evaluator import Evaluator -from fmcore.framework.metric import Metric -from fmcore.framework.predictions import Predictions -from fmcore.framework.tracker.Tracker import Tracker -from fmcore.util import Timeout, Timeout24Hr, Timer, confloat, get_default, safe_validate_arguments +from bears import FileMetadata +from bears.util import Timeout, Timeout24Hr, Timer, confloat, get_default, safe_validate_arguments + +from fmcore.framework._algorithm import Algorithm +from fmcore.framework._evaluator.Evaluator import Evaluator +from fmcore.framework._metric import Metric +from fmcore.framework._predictions import Predictions +from fmcore.framework._tracker.Tracker import Tracker class LocalEvaluator(Evaluator): diff --git a/src/fmcore/framework/evaluator/RayEvaluator.py b/src/fmcore/framework/_evaluator/RayEvaluator.py similarity index 97% rename from src/fmcore/framework/evaluator/RayEvaluator.py rename to 
src/fmcore/framework/_evaluator/RayEvaluator.py index ac7db58..bd36c93 100644 --- a/src/fmcore/framework/evaluator/RayEvaluator.py +++ b/src/fmcore/framework/_evaluator/RayEvaluator.py @@ -6,27 +6,21 @@ import warnings from contextlib import ExitStack, contextmanager from functools import partial -from typing import * - -from pydantic import Extra, confloat, conint, root_validator -from pydantic.typing import Literal - -from fmcore.constants import ( - FILE_FORMAT_TO_FILE_ENDING_MAP, - REMOTE_STORAGES, - DataLayout, - FailureAction, - Storage, +from typing import ( + Any, + Callable, + Dict, + List, + Literal, + Optional, + Tuple, + Union, ) -from fmcore.data import FileMetadata, ScalableDataFrame -from fmcore.data.sdf import DaskScalableDataFrame -from fmcore.framework.evaluator.Evaluator import Evaluator, save_predictions -from fmcore.framework.metric import Metric -from fmcore.framework.predictions import Predictions, load_predictions -from fmcore.framework.task_data import Dataset -from fmcore.framework.tracker.Tracker import Tracker -from fmcore.util import ( - AutoEnum, + +from autoenum import AutoEnum, auto +from bears import FileMetadata +from bears.core.frame import DaskScalableDataFrame, ScalableDataFrame +from bears.util import ( LoadBalancingStrategy, ProgressBar, RayActorComposite, @@ -37,7 +31,6 @@ Timer, accumulate, as_list, - auto, get_default, get_result, ignore_all_output, @@ -51,8 +44,22 @@ set_param_from_alias, wait, ) -from fmcore.util.aws import S3Util -from fmcore.util.language._import import _IS_RAY_INSTALLED +from bears.util.aws import S3Util +from bears.util.language._import import _IS_RAY_INSTALLED +from pydantic import ConfigDict, confloat, conint, model_validator + +from fmcore.constants import ( + FILE_FORMAT_TO_FILE_ENDING_MAP, + REMOTE_STORAGES, + DataLayout, + FailureAction, + Storage, +) +from fmcore.framework._dataset import Dataset +from fmcore.framework._evaluator.Evaluator import Evaluator, save_predictions +from fmcore.framework._metric import Metric +from fmcore.framework._predictions import Predictions, load_predictions +from fmcore.framework._tracker.Tracker import Tracker RayEvaluator = "RayEvaluator" if _IS_RAY_INSTALLED: @@ -109,7 +116,7 @@ def __init__( request_counter: RequestCounter, verbosity: int, ): - from fmcore.framework import Evaluator + from fmcore.framework._evaluator import Evaluator self.verbosity = verbosity self.evaluator: Optional[Evaluator] = ( @@ -151,10 +158,11 @@ def evaluate_shard( from concurrent.futures._base import Future import pandas as pd + from bears import FileMetadata + from bears.util import accumulate - from fmcore.data import FileMetadata - from fmcore.framework import Dataset, Predictions - from fmcore.util import accumulate + from fmcore.framework._dataset import Dataset + from fmcore.framework._predictions import Predictions ## Stops Pandas SettingWithCopyWarning in output. 
Ref: https://stackoverflow.com/a/20627316 pd.options.mode.chained_assignment = None @@ -298,8 +306,9 @@ def evaluate_shard( class RayEvaluator(Evaluator): aliases = ["ray"] - class Config(Evaluator.Config): - extra = Extra.allow + model_config = ConfigDict( + extra="allow", + ) class RunConfig(Evaluator.RunConfig): ray_init: RayInitConfig = {} @@ -315,7 +324,8 @@ class RunConfig(Evaluator.RunConfig): ## By default, do not cache the model: cache_timeout: Optional[Union[Timeout, confloat(gt=0)]] = None - @root_validator(pre=True) + @model_validator(mode="before") + @classmethod def ray_evaluator_params(cls, params: Dict) -> Dict: set_param_from_alias(params, param="nested_evaluator_name", alias=["nested_evaluator"]) set_param_from_alias( diff --git a/src/fmcore/framework/_evaluator/__init__.py b/src/fmcore/framework/_evaluator/__init__.py new file mode 100644 index 0000000..492928b --- /dev/null +++ b/src/fmcore/framework/_evaluator/__init__.py @@ -0,0 +1,4 @@ +from fmcore.framework._evaluator.Evaluator import * +from fmcore.framework._evaluator.LocalEvaluator import * +from fmcore.framework._evaluator.AccelerateEvaluator import * +from fmcore.framework._evaluator.RayEvaluator import * \ No newline at end of file diff --git a/src/fmcore/framework/metric.py b/src/fmcore/framework/_metric.py similarity index 97% rename from src/fmcore/framework/metric.py rename to src/fmcore/framework/_metric.py index a9d7caa..86633fd 100644 --- a/src/fmcore/framework/metric.py +++ b/src/fmcore/framework/_metric.py @@ -1,16 +1,22 @@ import copy import math from abc import ABC -from functools import singledispatchmethod -from typing import * +from typing import ( + Any, + ClassVar, + Dict, + Generator, + List, + Optional, + Tuple, + Type, + Union, +) import numpy as np import pandas as pd -from pydantic import conint, constr, root_validator - -from fmcore.constants import AggregationStrategy, DataSplit, MLType, Parallelize -from fmcore.data.sdf import ScalableDataFrame -from fmcore.util import ( +from bears import ScalableDataFrame +from bears.util import ( Alias, MutableParameters, Parameters, @@ -32,10 +38,13 @@ safe_validate_arguments, set_param_from_alias, ) -from fmcore.util.language._import import _check_is_ray_installed +from bears.util.language._import import _check_is_ray_installed +from pydantic import conint, constr, model_validator + +from fmcore.constants import AggregationStrategy, DataSplit, MLType, Parallelize -Metric = "Metric" Metrics = "Metrics" +Metric = "Metric" class MetricEvaluationError(Exception): @@ -52,11 +61,6 @@ class Metric(MutableParameters, Registry): _allow_subclass_override = True required_assets: ClassVar[Tuple[MLType, ...]] = () - class Config(MutableParameters.Config): - keep_untouched = (singledispatchmethod,) - ## Ref of validating set calls: https://docs.pydantic.dev/1.10/usage/model_config/ - validate_assignment = True - class Params(Parameters): """ BaseModel for parameters. Expected to be overridden by subclasses. @@ -82,7 +86,8 @@ class Params(Metric.Params): max_workers: Optional[int] = None max_retries: int = 1 ## Do not retry - @root_validator(pre=True) + @model_validator(mode="before") + @classmethod def _set_alias(cls, params: Dict) -> Dict: set_param_from_alias(params, param="display_decimals", alias=["decimals", "rounding", "round"]) return params @@ -115,7 +120,8 @@ def _pre_registration_hook(cls): f"`update` and `compute` functions together; at present, only `compute` is implemented." 
) - @root_validator(pre=True) + @model_validator(mode="before") + @classmethod def convert_params(cls, params: Dict): params["params"] = super(Metric, cls)._convert_params(cls.Params, params.get("params")) params["name"] = cls.class_name @@ -316,7 +322,8 @@ class AggregatedPercentageMetric(PercentageMetric, ABC): class Params(PercentageMetric.Params): aggregation: AggregationStrategy - @root_validator(pre=True) + @model_validator(mode="before") + @classmethod def set_aliases(cls, params: Dict) -> Dict: set_param_from_alias( params, @@ -421,7 +428,8 @@ def of(cls, metrics: Optional[Union[Metrics, Dict]] = None, **metrics_kwargs) -> except Exception as e: raise ValueError(f"Failed to create Metrics object.\nnError: {String.format_exception_msg(e)}") - @root_validator(pre=True) + @model_validator(mode="before") + @classmethod def set_metrics(cls, params: Dict): metrics_dict: Dict[DataSplit, List[Metric]] = {} for data_split, metrics in params["metrics"].items(): @@ -697,10 +705,6 @@ def _evaluate_rolling_metric( raise MetricEvaluationError(error_msg) -MetricsCollection = Metrics -Metrics.update_forward_refs() - - def metric_stats_str( metric_display_name: str, metric_stats: Dict[str, Union[int, float, Dict]], diff --git a/src/fmcore/framework/predictions.py b/src/fmcore/framework/_predictions.py similarity index 97% rename from src/fmcore/framework/predictions.py rename to src/fmcore/framework/_predictions.py index 742f118..5dd2c11 100644 --- a/src/fmcore/framework/predictions.py +++ b/src/fmcore/framework/_predictions.py @@ -1,24 +1,18 @@ from abc import ABC from copy import deepcopy -from typing import * +from typing import ClassVar, Dict, List, Literal, NoReturn, Optional, Set, TypeVar, Union import pandas as pd -from pydantic import root_validator -from pydantic.typing import Literal - -from fmcore.constants import ( - FILE_FORMAT_TO_FILE_ENDING_MAP, - DataLayout, - DataSplit, - FileFormat, - MLTypeSchema, - TaskOrStr, +from bears import FileMetadata +from bears.core.frame import ( + ScalableDataFrame, + ScalableOrRaw, + ScalableSeries, + ScalableSeriesOrRaw, + is_scalable, ) -from fmcore.data.FileMetadata import FileMetadata -from fmcore.data.sdf import ScalableDataFrame, ScalableOrRaw, ScalableSeries, ScalableSeriesOrRaw, is_scalable -from fmcore.framework.mixins import InputOutputDataMixin, SchemaValidationError -from fmcore.framework.task_data import Dataset -from fmcore.util import ( +from bears.reader import DataFrameReader, JsonReader +from bears.util import ( Registry, Schema, SchemaTemplate, @@ -31,6 +25,19 @@ run_concurrent, safe_validate_arguments, ) +from bears.writer import DataFrameWriter, JsonWriter +from pydantic import model_validator + +from fmcore.constants import ( + FILE_FORMAT_TO_FILE_ENDING_MAP, + DataLayout, + DataSplit, + FileFormat, + MLTypeSchema, + TaskOrStr, +) +from fmcore.framework._task_mixins import InputOutputDataMixin, SchemaValidationError +from fmcore.framework._dataset import Dataset Predictions = "Predictions" Visualization = "Visualization" @@ -59,7 +66,8 @@ def _pre_registration_hook(cls): predictions_schema_template=cls.predictions_schema, ) - @root_validator(pre=True) + @model_validator(mode="before") + @classmethod def _set_predictions_params(cls, params: Dict) -> Dict: params["data_schema"]: Schema = Schema.of(params["data_schema"], schema_template=cls.schema_template) # data_schema: Union[Schema, MLTypeSchema] = params['data_schema'] @@ -389,7 +397,7 @@ def visualize( name: Optional[str] = None, **kwargs, ) -> Visualization: - from 
fmcore.framework.visualize import Visualization + from fmcore.framework._visualize import Visualization ## Should show an interactive plot. return Visualization.plot(data=self, name=name, **kwargs) @@ -426,8 +434,6 @@ def load_predictions( if predictions_source is None: return ## Don't want to mistake with similar params used for prediction: - from fmcore.data.reader import DataFrameReader, JsonReader - predictions_source: FileMetadata = FileMetadata.of(predictions_source) reader: DataFrameReader = DataFrameReader.of( predictions_source.format, @@ -533,8 +539,6 @@ def save_predictions( overwrite: bool = True, **kwargs, ) -> NoReturn: - from fmcore.data.writer import DataFrameWriter, JsonWriter - if any_are_none(predictions, predictions_destination): return predictions_destination: FileMetadata = FileMetadata.of(predictions_destination) diff --git a/src/fmcore/framework/_task/__init__.py b/src/fmcore/framework/_task/__init__.py new file mode 100644 index 0000000..0da7706 --- /dev/null +++ b/src/fmcore/framework/_task/__init__.py @@ -0,0 +1,7 @@ +from fmcore.framework._task.regression import * +from fmcore.framework._task.classification import * +from fmcore.framework._task.embedding import * +from fmcore.framework._task.text_generation import * +from fmcore.framework._task.retrieval import * +from fmcore.framework._task.dense_retrieval import * +from fmcore.framework._task.sparse_retrieval import * diff --git a/src/fmcore/framework/task/classification.py b/src/fmcore/framework/_task/classification.py similarity index 98% rename from src/fmcore/framework/task/classification.py rename to src/fmcore/framework/_task/classification.py index b904e87..b89d6b4 100644 --- a/src/fmcore/framework/task/classification.py +++ b/src/fmcore/framework/_task/classification.py @@ -1,16 +1,29 @@ from abc import ABC, abstractmethod from functools import partial -from typing import * +from typing import ( + Any, + Callable, + ClassVar, + Dict, + List, + Optional, + Set, + Tuple, + Union, +) import numpy as np import pandas as pd +from bears import ScalableDataFrame, ScalableSeries +from bears.processor import EncodingRange, LabelEncoding +from bears.util import all_are_np_subtypes, as_list, as_tuple, is_list_or_set_like, safe_validate_arguments from pydantic import constr from fmcore.constants import DataLayout, DataSplit, MLType, MLTypeSchema, Task -from fmcore.data import ScalableDataFrame, ScalableSeries -from fmcore.data.processor import EncodingRange, LabelEncoding -from fmcore.framework import Algorithm, Dataset, Metric, Metrics, Predictions -from fmcore.util import all_are_np_subtypes, as_list, as_tuple, is_list_or_set_like, safe_validate_arguments +from fmcore.framework._algorithm import Algorithm +from fmcore.framework._dataset import Dataset +from fmcore.framework._metric import Metric, Metrics +from fmcore.framework._predictions import Predictions class ClassificationData(Dataset): diff --git a/src/fmcore/framework/task/dense_retrieval.py b/src/fmcore/framework/_task/dense_retrieval.py similarity index 94% rename from src/fmcore/framework/task/dense_retrieval.py rename to src/fmcore/framework/_task/dense_retrieval.py index 7b1bcac..f2bceba 100644 --- a/src/fmcore/framework/task/dense_retrieval.py +++ b/src/fmcore/framework/_task/dense_retrieval.py @@ -1,25 +1,18 @@ from abc import abstractmethod -from typing import * +from typing import ( + Any, + ClassVar, + Dict, + List, + Optional, + Tuple, + Type, + Union, +) import numpy as np -from pydantic import root_validator - -from fmcore.constants import 
DataLayout, DataSplit, FailureAction, MLType, Task -from fmcore.data import FileMetadata, ScalableDataFrame, ScalableSeries, ScalableSeriesRawType -from fmcore.framework import Algorithm, Predictions, load_predictions -from fmcore.framework.task.embedding import EMBEDDINGS_COL, Embedder, EmbeddingData, Embeddings -from fmcore.framework.task.retrieval import ( - RETRIEVAL_FORMAT_MSG, - RETRIEVAL_RANKED_RESULTS_COL, - DistanceMetric, - Queries, - RankedResult, - RankedResults, - RelevanceAnnotation, - RetrievalIndex, - Retriever, -) -from fmcore.util import ( +from bears import FileMetadata, ScalableDataFrame, ScalableSeries, ScalableSeriesRawType +from bears.util import ( INDEX_COL_DEFAULT_NAME, Alias, MappedParameters, @@ -36,8 +29,23 @@ set_param_from_alias, type_str, ) +from pydantic import model_validator -DenseRetrievalIndex = "DenseRetrievalIndex" +from fmcore.constants import DataLayout, DataSplit, FailureAction, MLType, Task +from fmcore.framework._algorithm import Algorithm +from fmcore.framework._predictions import Predictions, load_predictions +from fmcore.framework._task.embedding import EMBEDDINGS_COL, Embedder, EmbeddingData, Embeddings +from fmcore.framework._task.retrieval import ( + RETRIEVAL_FORMAT_MSG, + RETRIEVAL_RANKED_RESULTS_COL, + DistanceMetric, + Queries, + RankedResult, + RankedResults, + RelevanceAnnotation, + RetrievalIndex, + Retriever, +) def _normalize_l2(embeddings: np.ndarray) -> np.ndarray: @@ -93,7 +101,7 @@ class FaissIndexParams(MappedParameters): DistanceMetric.INNER_PRODUCT: faiss.METRIC_INNER_PRODUCT, } - _mapping = append_to_keys( + mapping_dict: ClassVar[Dict[str, Type]] = append_to_keys( prefix="faiss.", d={ "IndexFlatL2": faiss.IndexFlatL2, ## L2 = Euclidean distance @@ -108,7 +116,8 @@ class FaissIndexParams(MappedParameters): ) distance_metric: Optional[DistanceMetric] = None - @root_validator(pre=True) + @model_validator(mode="before") + @classmethod def set_faiss_index_params(cls, params: Dict) -> Dict: set_param_from_alias( params, @@ -139,7 +148,7 @@ def set_faiss_index_params(cls, params: Dict) -> Dict: distance_metric: DistanceMetric = DistanceMetric.from_str(distance_metric) if ( distance_metric is DistanceMetric.COSINE_SIMILARITY - and cls._mapping[index_name] != faiss.IndexFlatIP + and cls.mapping_dict[index_name] != faiss.IndexFlatIP ): raise ValueError( f"When using distance_metric={distance_metric}, " @@ -148,7 +157,7 @@ def set_faiss_index_params(cls, params: Dict) -> Dict: ## Ref: https://github.com/facebookresearch/faiss/wiki/MetricType-and-distances#additional-metrics if ( distance_metric in {DistanceMetric.L1, DistanceMetric.Linf} - and cls._mapping[index_name] != faiss.IndexFlat + and cls.mapping_dict[index_name] != faiss.IndexFlat ): raise ValueError( f"When using distance_metric={distance_metric}, " @@ -163,7 +172,7 @@ def set_faiss_index_params(cls, params: Dict) -> Dict: ] if ( - cls._mapping[index_name] == faiss.IndexFlat + cls.mapping_dict[index_name] == faiss.IndexFlat and distance_metric in cls._DISTANCE_METRICS_TO_FAISS ): args.append(cls._DISTANCE_METRICS_TO_FAISS[distance_metric]) @@ -181,8 +190,10 @@ class FaissRetrievalIndex(DenseRetrievalIndex): doc_id2faiss_idx: Dict[str, int] = {} docs: Dict[str, Dict] = {} - @root_validator(pre=False) + @model_validator(mode="before") + @classmethod def set_faiss_params(cls, params: Dict) -> Dict: + cls.set_default_param_values(params) params["params"] = FaissIndexParams.of(params["params"]) return params @@ -397,7 +408,7 @@ def predict_step( return {"ranked_results": 
ranked_results} def _embedder_predict(self, query_embedding_data: EmbeddingData, **kwargs) -> Embeddings: - from fmcore.framework.evaluator import Evaluator + from fmcore.framework._evaluator import Evaluator # print('Queries:') # with pd_display() as disp: diff --git a/src/fmcore/framework/task/embedding.py b/src/fmcore/framework/_task/embedding.py similarity index 81% rename from src/fmcore/framework/task/embedding.py rename to src/fmcore/framework/_task/embedding.py index 4f24efc..66e7821 100644 --- a/src/fmcore/framework/task/embedding.py +++ b/src/fmcore/framework/_task/embedding.py @@ -1,12 +1,18 @@ from abc import ABC -from typing import * +from typing import ( + Dict, + List, + Union, +) import numpy as np +from bears import ScalableSeries +from bears.util import is_list_like from fmcore.constants import MLType, Task -from fmcore.data import ScalableSeries -from fmcore.framework import Algorithm, Dataset, Predictions -from fmcore.util import is_list_like +from fmcore.framework._algorithm import Algorithm +from fmcore.framework._dataset import Dataset +from fmcore.framework._predictions import Predictions class EmbeddingData(Dataset): diff --git a/src/fmcore/framework/task/ranking.py b/src/fmcore/framework/_task/ranking.py similarity index 68% rename from src/fmcore/framework/task/ranking.py rename to src/fmcore/framework/_task/ranking.py index 6dad4ed..794f890 100644 --- a/src/fmcore/framework/task/ranking.py +++ b/src/fmcore/framework/_task/ranking.py @@ -1,12 +1,19 @@ from abc import ABC -from typing import * +from typing import ( + Dict, + List, + Union, +) import numpy as np +from bears import ScalableSeries +from bears.util import is_list_like from fmcore.constants import MLType, Task -from fmcore.data import ScalableSeries -from fmcore.framework import Algorithm, Dataset, Predictions -from fmcore.util import is_list_like +from fmcore.framework._algorithm import Algorithm +from fmcore.framework._dataset import Dataset +from fmcore.framework._predictions import Predictions +from fmcore.framework._task.embedding import Embeddings class RankingData(Dataset): @@ -28,8 +35,8 @@ class RankedResults(Predictions): @property def embeddings(self) -> ScalableSeries: - predicted_emebeddings_col: str = next(iter(self.data_schema.predictions_schema.keys())) - return self.data[predicted_emebeddings_col] + predicted_embeddings_col: str = next(iter(self.data_schema.predictions_schema.keys())) + return self.data[predicted_embeddings_col] class Ranker(Algorithm, ABC): diff --git a/src/fmcore/framework/task/regression.py b/src/fmcore/framework/_task/regression.py similarity index 85% rename from src/fmcore/framework/task/regression.py rename to src/fmcore/framework/_task/regression.py index 8a658bf..9a8e19c 100644 --- a/src/fmcore/framework/task/regression.py +++ b/src/fmcore/framework/_task/regression.py @@ -1,12 +1,18 @@ from abc import ABC -from typing import * +from typing import ( + Dict, + List, + Union, +) import numpy as np +from bears import ScalableSeries +from bears.util import is_list_like from fmcore.constants import MLType, Task -from fmcore.data import ScalableSeries -from fmcore.framework import Algorithm, Dataset, Predictions -from fmcore.util import is_list_like +from fmcore.framework._algorithm import Algorithm +from fmcore.framework._dataset import Dataset +from fmcore.framework._predictions import Predictions class RegressionData(Dataset): diff --git a/src/fmcore/framework/task/retrieval.py b/src/fmcore/framework/_task/retrieval.py similarity index 92% rename from 
src/fmcore/framework/task/retrieval.py rename to src/fmcore/framework/_task/retrieval.py index c930260..b89f75a 100644 --- a/src/fmcore/framework/task/retrieval.py +++ b/src/fmcore/framework/_task/retrieval.py @@ -1,25 +1,35 @@ from abc import ABC, abstractmethod -from typing import * - -from pydantic import Extra, conint, constr, root_validator +from typing import ( + Any, + ClassVar, + Dict, + List, + Optional, + Set, + Type, + Union, +) -from fmcore.constants import DataLayout, MLType, MLTypeSchema, Task -from fmcore.data import FileMetadata, ScalableDataFrame, ScalableSeries, ScalableSeriesRawType -from fmcore.framework import Algorithm, Dataset, Predictions -from fmcore.framework.task.embedding import EmbeddingData -from fmcore.util import ( - AutoEnum, +from autoenum import AutoEnum, auto +from bears import FileMetadata, ScalableDataFrame, ScalableSeries, ScalableSeriesRawType +from bears.util import ( MutableParameters, Parameters, Registry, Schema, String, as_list, - auto, random_sample, safe_validate_arguments, set_param_from_alias, ) +from pydantic import ConfigDict, conint, constr, model_validator + +from fmcore.constants import DataLayout, MLType, MLTypeSchema, Task +from fmcore.framework._algorithm import Algorithm +from fmcore.framework._dataset import Dataset +from fmcore.framework._predictions import Predictions +from fmcore.framework._task.embedding import EmbeddingData RelevanceAnnotation = "RelevanceAnnotation" RankedResult = "RankedResult" @@ -52,7 +62,8 @@ def of(cls, data: Union[Parameters, Dict]) -> RelevanceAnnotation: f"Unsupported type for {RelevanceAnnotation}: {type(data)} with value: {data}" ) - @root_validator(pre=True) + @model_validator(mode="before") + @classmethod def set_relevance_annotation_params(cls, params: Dict) -> Dict: set_param_from_alias(params, param="document_id", alias=["id", "doc_id"]) set_param_from_alias(params, param="relevance_grade", alias=["grade", "relevance_level", "level"]) @@ -109,9 +120,10 @@ def __repr__(self) -> str: def __str__(self) -> str: return f"{self.class_name} with items:\n{String.pretty(self.dict())}" - class Config(Parameters.Config): - ## Allow extra keyword parameters to be stored in RankedResult. 
- extra = Extra.allow + model_config = ConfigDict( + ## Allow extra keyword parameters to be stored in RankedResult: + extra="allow", + ) @classmethod def of(cls, data: Union[Parameters, Dict]) -> RankedResult: @@ -121,7 +133,8 @@ def of(cls, data: Union[Parameters, Dict]) -> RankedResult: return cls(**data) raise NotImplementedError(f"Unsupported type for {RankedResult}: {type(data)} with value: {data}") - @root_validator(pre=True) + @model_validator(mode="before") + @classmethod def set_ranked_result_params(cls, params: Dict) -> Dict: set_param_from_alias(params, param="document_id", alias=["id", "doc_id"]) set_param_from_alias(params, param="document", alias=["doc", "asset", "passage"]) @@ -227,8 +240,9 @@ class Retriever(Algorithm, ABC): class RetrievalIndex(MutableParameters, Registry, ABC): index: Optional[Any] = None - class Config(MutableParameters.Config): - extra = Extra.ignore + model_config = ConfigDict( + extra="ignore", + ) @classmethod def of( diff --git a/src/fmcore/framework/task/sparse_retrieval.py b/src/fmcore/framework/_task/sparse_retrieval.py similarity index 98% rename from src/fmcore/framework/task/sparse_retrieval.py rename to src/fmcore/framework/_task/sparse_retrieval.py index 155e8a4..f6f0afa 100644 --- a/src/fmcore/framework/task/sparse_retrieval.py +++ b/src/fmcore/framework/_task/sparse_retrieval.py @@ -3,36 +3,35 @@ import multiprocessing as mp from abc import ABC, abstractmethod from collections import Counter -from typing import * +from typing import ( + Any, + Callable, + ClassVar, + Dict, + Generator, + List, + Literal, + NoReturn, + Optional, + Set, + Tuple, + Type, + Union, +) import numpy as np import pandas as pd -from pydantic import Extra, confloat, conint, root_validator - -from fmcore.constants import DataLayout, DataSplit, MLType, Parallelize -from fmcore.data import ( +from autoenum import AutoEnum, auto +from bears import ( FileMetadata, ScalableDataFrame, ScalableDataFrameRawType, ScalableSeries, ScalableSeriesRawType, ) -from fmcore.framework import Algorithm, Dataset, load_dataset -from fmcore.framework.task.retrieval import ( - RETRIEVAL_FORMAT_MSG, - RETRIEVAL_RANKED_RESULTS_COL, - Queries, - RankedResult, - RankedResults, - RelevanceAnnotation, - RetrievalCorpus, - RetrievalIndex, - Retriever, -) -from fmcore.util import ( +from bears.util import ( INDEX_COL_DEFAULT_NAME, Alias, - AutoEnum, Executor, Log, MappedParameters, @@ -46,7 +45,6 @@ accumulate, accumulate_iter, append_to_keys, - auto, best_k, check_isinstance, dispatch, @@ -60,9 +58,23 @@ type_str, wait, ) -from fmcore.util.language._import import _IS_RAY_INSTALLED +from bears.util.language._import import _IS_RAY_INSTALLED +from pydantic import ConfigDict, confloat, conint, model_validator -SparseRetrievalIndex = "SparseRetrievalIndex" +from fmcore.constants import DataLayout, DataSplit, MLType, Parallelize +from fmcore.framework._algorithm import Algorithm +from fmcore.framework._dataset import Dataset, load_dataset +from fmcore.framework._task.retrieval import ( + RETRIEVAL_FORMAT_MSG, + RETRIEVAL_RANKED_RESULTS_COL, + Queries, + RankedResult, + RankedResults, + RelevanceAnnotation, + RetrievalCorpus, + RetrievalIndex, + Retriever, +) class SparseRetrievalIndex(RetrievalIndex): @@ -153,8 +165,9 @@ def _tokenize_text(text: Union[str, List[str]], tokenizer: Optional[Callable] = class BM25IndexStore(MutableParameters, ABC): """A single index shard.""" - class Config(MutableParameters.Config): - extra = Extra.ignore + model_config = ConfigDict( + extra="ignore", + ) 
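A recurring change in the hunks above is the Pydantic v2 configuration style: the v1 nested `class Config` with `extra = Extra.allow` / `Extra.ignore` becomes a `model_config = ConfigDict(...)` class attribute. A minimal sketch of the pattern, assuming Pydantic v2 and using a hypothetical `Doc` model rather than any class from this patch:

    from pydantic import BaseModel, ConfigDict

    class Doc(BaseModel):
        # Pydantic v2: model-wide settings live in `model_config` instead of a
        # nested `class Config`; extra="allow" keeps unknown keyword arguments.
        model_config = ConfigDict(extra="allow")

        document_id: str

    doc = Doc(document_id="d1", score=0.93)
    print(doc.score)  # 0.93 -- retained because of extra="allow"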
distance_metric: ClassVar[BM25DistanceMetric] @@ -440,7 +453,7 @@ class BM25IndexStoreParams(MappedParameters): indexing_parallelize: Parallelize = Parallelize.sync indexing_max_workers: Optional[conint(ge=1)] = None - _mapping = append_to_keys( + mapping_dict: ClassVar[Dict[str, BM25IndexStore]] = append_to_keys( prefix="BM25", d={ "Okapi": BM25Okapi, @@ -675,8 +688,10 @@ class BM25RetrievalIndex(BM25RetrievalIndexBase): aliases = ["BM25"] index: Optional[BM25IndexStore] = None - @root_validator(pre=False) + @model_validator(mode="before") + @classmethod def set_bm25_params(cls, params: Dict) -> Dict: + cls.set_default_param_values(params) params["params"] = BM25IndexStoreParams.of(params["params"]) return params @@ -878,8 +893,10 @@ class RayBM25RetrievalIndex(BM25RetrievalIndexBase): params: Optional[Union[RayBM25IndexStoreParams, Dict, str]] = None _doc_id_to_doc_cache: Dict[str, Tuple[BM25IndexStoreDoc, str]] = {} - @root_validator(pre=False) + @model_validator(mode="before") + @classmethod def set_bm25_params(cls, params: Dict) -> Dict: + cls.set_default_param_values(params) params["params"] = RayBM25IndexStoreParams.of(params["params"]) return params diff --git a/src/fmcore/framework/task/text_generation.py b/src/fmcore/framework/_task/text_generation.py similarity index 97% rename from src/fmcore/framework/task/text_generation.py rename to src/fmcore/framework/_task/text_generation.py index f2f31c3..fa64f88 100644 --- a/src/fmcore/framework/task/text_generation.py +++ b/src/fmcore/framework/_task/text_generation.py @@ -1,19 +1,12 @@ from abc import ABC, abstractmethod from copy import deepcopy from math import inf, log -from typing import * +from typing import Any, Callable, ClassVar, Dict, List, Literal, Optional, Set, Tuple, Union import numpy as np import pandas as pd -from pydantic import Extra, confloat, conint, constr, root_validator -from pydantic.typing import Literal - -from fmcore.constants import DataLayout, DataSplit, FailureAction, MLType, Task -from fmcore.data import FileMetadata, ScalableDataFrame, ScalableSeries -from fmcore.framework import Algorithm, Dataset, Predictions -from fmcore.framework.task.classification import ClassificationData, Classifier, MultiLabelClassifier -from fmcore.framework.task.retrieval import Queries, RankedResults, Retriever -from fmcore.util import ( +from bears import FileMetadata, ScalableDataFrame, ScalableSeries +from bears.util import ( Alias, MappedParameters, Parameters, @@ -28,6 +21,14 @@ set_param_from_alias, type_str, ) +from pydantic import ConfigDict, confloat, conint, constr, model_validator + +from fmcore.constants import DataLayout, DataSplit, FailureAction, MLType, Task +from fmcore.framework._algorithm import Algorithm +from fmcore.framework._dataset import Dataset +from fmcore.framework._predictions import Predictions +from fmcore.framework._task.classification import ClassificationData, Classifier, MultiLabelClassifier +from fmcore.framework._task.retrieval import Queries, RankedResults, Retriever PROMPT: str = "prompt" PROMPT_TEMPLATE: str = "prompt_template" @@ -82,10 +83,12 @@ class ICLSampler(Parameters): num_shots: conint(ge=0) = 0 shots_sep: constr(min_length=0) = "\n\n" - class Config(Parameters.Config): - extra = Extra.ignore + model_config = ConfigDict( + extra="ignore", + ) - @root_validator(pre=True) + @model_validator(mode="before") + @classmethod def set_icl_sampler_params(cls, params: Dict) -> Dict: set_param_from_alias( params, @@ -243,8 +246,10 @@ class ClassificationICLSampler(ICLSampler): def 
_icl_label_col_binarized_entailment(cls, icl_dataset: ClassificationData) -> str: return f"{icl_dataset.ground_truth_label_col_name}_binarized" - @root_validator(pre=False) + @model_validator(mode="before") + @classmethod def set_classification_icl_sampler_params(cls, params: Dict) -> Dict: + cls.set_default_param_values(params) params["label_verbalizer"]: Dict[str, str] = { params["label_normalizer"](lb): lb_description for lb, lb_description in params["label_verbalizer"].items() @@ -526,7 +531,8 @@ class Prompts(Dataset): prompt_template_map_col: Optional[str] = None prompt_template_apply: Optional[Literal["expand", "map"]] = None - @root_validator(pre=True) + @model_validator(mode="before") + @classmethod def set_prompt_data_params(cls, params: Dict) -> Dict: set_param_from_alias( params, param="prompt_template", alias=["template", "prompt", "prompt_templates"] @@ -581,9 +587,10 @@ class NextTokens(TextGenerationsPredictionsBase): class TextGenerationParams(Parameters, ABC): ## Becomes an anonymous class later - class Config(Parameters.Config): - ## Allow extra keyword parameters to be used when initializing the class. - extra = Extra.allow + model_config = ConfigDict( + ## Allow extra keyword parameters to be used when initializing the class: + extra="allow", + ) strategy: ClassVar[str] @@ -606,8 +613,10 @@ class Config(Parameters.Config): ] = 1e-4 ## Tokens with scores below this tolerance are ignored. Set to None to not filter any tokens. force_vocab_size: bool = True - @root_validator(pre=False) + @model_validator(mode="before") + @classmethod def set_gen_params(cls, params: Dict) -> Dict: + cls.set_default_param_values(params) if params["output_scores_format"] == "probabilities": params["renormalize_logits"]: bool = True params["min_possible_score"]: float = 0.0 @@ -676,7 +685,7 @@ class LogitsProcessorListParams(TextGenerationParams): class TextGenerationParamsMapper(MappedParameters): - _mapping = { + mapping_dict: ClassVar[Dict[Tuple[str, ...], TextGenerationParams]] = { ("GreedyDecoding", "greedy"): GreedyDecodingParams, ("BeamSearch", "beam"): BeamSearchParams, ("TopKSampling", "top_k"): TopKSamplingParams, @@ -757,7 +766,8 @@ class Hyperparameters(Algorithm.Hyperparameters): ## Filters by this column in ICL dataset to this column in the batch of data. 
icl_filter_col: Optional[constr(min_length=1)] = None - @root_validator(pre=True) + @model_validator(mode="before") + @classmethod def set_lm_task_params(cls, params: Dict) -> Dict: set_param_from_alias( params, param="lm", alias=["llm", "prompter", "base_llm", "base_model", "base"] @@ -817,7 +827,7 @@ def expanded_prompt_template(self) -> str: return prompt_template def _lm_predict(self, prompts: Prompts, **kwargs) -> NextTokens: - from fmcore.framework.evaluator import Evaluator + from fmcore.framework._evaluator import Evaluator # print('Prompts:') # with pd_display() as disp: @@ -1026,7 +1036,7 @@ def predict_step(self, batch: TextInputs, query_template: Optional[str] = None, return text_generations def _retriever_predict(self, queries: Queries, **kwargs) -> RankedResults: - from fmcore.framework.evaluator import Evaluator + from fmcore.framework._evaluator import Evaluator # print('Queries:') # with pd_display() as disp: @@ -1092,7 +1102,8 @@ class Hyperparameters(LanguageModelTaskMixin.Hyperparameters): label_verbalizer: Dict[str, str] prompt_template_label_key: constr(min_length=1) - @root_validator(pre=True) + @model_validator(mode="before") + @classmethod def set_zeroshot_params(cls, params: Dict) -> Dict: set_param_from_alias(params, param="entailment_yes_tokens", alias=["yes_tokens"]) set_param_from_alias(params, param="entailment_no_tokens", alias=["no_tokens"]) @@ -1350,9 +1361,7 @@ def _entailment_df_to_predictions( entailment_df.loc[:, batch.data_schema.index_col] = entailment_df[batch.data_schema.index_col].apply( lambda row_idx: str(row_idx).split(PROMPT_TEMPLATE_EXPANDER_SEP)[0] ) - entailment_df_row_gb: pandas.core.groupby.generic.DataFrameGroupBy = entailment_df.groupby( - batch.data_schema.index_col - ) + entailment_df_row_gb: Any = entailment_df.groupby(batch.data_schema.index_col) for row_idx in batch.index(): row_entailment_df: pd.DataFrame = entailment_df.loc[entailment_df_row_gb.groups[str(row_idx)]] if len(row_entailment_df) != len(labelspace): diff --git a/src/fmcore/framework/mixins.py b/src/fmcore/framework/_task_mixins.py similarity index 97% rename from src/fmcore/framework/mixins.py rename to src/fmcore/framework/_task_mixins.py index c168832..0e783a1 100644 --- a/src/fmcore/framework/mixins.py +++ b/src/fmcore/framework/_task_mixins.py @@ -2,33 +2,26 @@ import os from abc import ABC, abstractmethod from copy import deepcopy -from typing import * +from typing import ( + Any, + Callable, + ClassVar, + Dict, + Generator, + List, + Literal, + NoReturn, + Optional, + Set, + Tuple, + Type, + Union, +) import pandas as pd -from pandas.api.types import is_numeric_dtype, is_string_dtype -from pydantic import Extra, confloat, conint, root_validator -from pydantic.typing import Literal - -from fmcore.constants import ( - ASSET_ML_TYPES, - AVAILABLE_TENSOR_TYPES, - SHORTHAND_TO_TENSOR_LAYOUT_MAP, - DataLayout, - DataPosition, - DataSplit, - FileFormat, - MLType, - MLTypeOrStr, - MLTypeSchema, - Parallelize, - Task, - TaskOrStr, - TensorShortHand, -) -from fmcore.data.asset import Asset -from fmcore.data.FileMetadata import FileMetadata -from fmcore.data.reader import AssetReader, DataFrameReader -from fmcore.data.sdf import ( +from bears import FileMetadata +from bears.asset import Asset +from bears.core.frame import ( ScalableDataFrame, ScalableDataFrameOrRaw, ScalableOrRaw, @@ -36,8 +29,8 @@ ScalableSeriesOrRaw, TensorScalableSeries, ) -from fmcore.framework.metric import Metric -from fmcore.util import ( +from bears.reader import AssetReader, DataFrameReader +from 
bears.util import ( Alias, FractionalBool, MutableParameters, @@ -63,6 +56,27 @@ run_concurrent, safe_validate_arguments, ) +from bears.util.language._import import _IS_TORCH_INSTALLED +from pandas.api.types import is_numeric_dtype, is_string_dtype +from pydantic import ConfigDict, confloat, conint, model_validator + +from fmcore.constants import ( + ASSET_ML_TYPES, + AVAILABLE_TENSOR_TYPES, + SHORTHAND_TO_TENSOR_LAYOUT_MAP, + DataLayout, + DataPosition, + DataSplit, + FileFormat, + MLType, + MLTypeOrStr, + MLTypeSchema, + Parallelize, + Task, + TaskOrStr, + TensorShortHand, +) +from fmcore.framework._metric import Metric InputOutputDataMixin = "InputOutputDataMixin" @@ -124,16 +138,14 @@ def get_subclass( concrete_subclasses[Subclass.__class__.__name__] = Subclass return only_item(list(concrete_subclasses.values()), raise_error=False) raise ValueError( - f"Please pass either `key`, `task` or `name` arguments to retrieve subclasses of {cls.class_name}." + f"Please pass either `key`, `task` or `name` arguments to retrieve subclasses of {cls.class_name}" ) class InputOutputDataMixin(TaskRegistryMixin, ABC): - class Config(TaskRegistryMixin.Config): - extra = Extra.ignore - allow_mutation = True ## Mutable to allow caching - ## Ref of validating set calls: https://docs.pydantic.dev/1.10/usage/model_config/ - validate_assignment = True + model_config = ConfigDict( + extra="ignore", + ) schema_template: ClassVar[SchemaTemplate] @@ -187,7 +199,8 @@ def _of( **kwargs, ) - @root_validator(pre=True) + @model_validator(mode="before") + @classmethod def _set_data_params(cls, params: Dict) -> Dict: Alias.set_mapper(params) Alias.set_map_apply(params) @@ -215,10 +228,11 @@ def _set_data_params(cls, params: Dict) -> Dict: ) return params - @root_validator( - pre=False - ) ## Runs after any @root_validator(pre=True) on Dataset and Predictions subclasses. + ## Runs after any similar validators on subclasses(Dataset, Predictions, etc). 
+ @model_validator(mode="before") + @classmethod def _validate_data_params(cls, params: Dict) -> Dict: + cls.set_default_param_values(params) if params.get("validated") is True and isinstance(params.get("data"), ScalableDataFrame): try: cls.validate_schema(params["data_schema"]) @@ -374,13 +388,13 @@ def to_layout(self, layout: DataLayout, **kwargs) -> Any: def update_params(self, **new_params) -> Any: ## Since Parameters class is immutable, we create a new one: - overidden_params: Dict = { + overridden_params: Dict = { **self.dict(exclude={"data", "data_schema"}), "data": self.data, "data_schema": Schema(**self.data_schema.dict()), ## Copy schema **new_params, } - return self._constructor(**overidden_params) + return self._constructor(**overridden_params) def dict(self, *args, **kwargs): ## Ensure we don't call ScalableDataFrame.dict() @@ -408,11 +422,11 @@ def copy(self, deep: bool = False) -> Any: def torch_dataloader( self, *, error: Literal["raise", "warn", "ignore"] = "raise", **kwargs - ) -> Optional["PyTorchTaskDataDataset"]: - with optional_dependency("torch", error=error): - from fmcore.framework.dl.torch import PyTorchTaskDataDataset + ) -> Optional[Any]: + if _IS_TORCH_INSTALLED: + from fmcore.framework.dl.torch import PyTorchTaskDataset - return PyTorchTaskDataDataset(dataset=self, **kwargs).dataloader() + return PyTorchTaskDataset(dataset=self, **kwargs).dataloader() return None @safe_validate_arguments @@ -1036,14 +1050,16 @@ def filter( f"Passed function must have only one or two args; found: {len(get_fn_args(fn))}" ) rows_to_keep: ScalableSeries = batch.data.apply(fn, args=args) - if (rows_to_keep == False).all(): + if (rows_to_keep == False).all(): # noqa continue filtered_batch: InputOutputDataMixin = batch.update_params(data=batch.data.loc[rows_to_keep]) filtered_batches.append(filtered_batch) return self.concat(filtered_batches) def __filter_yield( - self, fn: Union[Callable[[Dict, Schema], bool], Callable[[Dict], bool]], **kwargs + self, + fn: Union[Callable[[Dict, Schema], bool], Callable[[Dict], bool]], + **kwargs, ) -> Generator[InputOutputDataMixin, None, None]: for batch in self.iter(**kwargs): assert isinstance(batch, InputOutputDataMixin) @@ -1056,7 +1072,7 @@ def __filter_yield( f"Passed function must have only one or two args; found: {len(get_fn_args(fn))}" ) rows_to_keep: ScalableSeries = batch.data.apply(fn, args=args) - if (rows_to_keep == False).all(): + if (rows_to_keep == False).all(): # noqa continue filtered_batch: InputOutputDataMixin = batch.update_params(data=batch.data.loc[rows_to_keep]) yield filtered_batch @@ -1084,14 +1100,14 @@ def __getattr__(self, attr_name: str): if attr_name in self.data.columns_set: return self.data[attr_name] raise AttributeError( - f"`{attr_name}` is neither an attribute of {self.class_name} nor a data colum; " + f"'{attr_name}' is neither an attribute of {self.class_name} nor a data column; " f"current data columns are: {self.data.columns}" ) def __getitem__(self, attr_name: str): if attr_name in self.data.columns_set: return self.data[attr_name] - raise KeyError(f"`{attr_name}` is not a data colum; current data columns are: {self.data.columns}") + raise KeyError(f"'{attr_name}' is not a data column; current data columns are: {self.data.columns}") @safe_validate_arguments def features( diff --git a/src/fmcore/framework/tracker/AimTracker.py b/src/fmcore/framework/_tracker/AimTracker.py similarity index 96% rename from src/fmcore/framework/tracker/AimTracker.py rename to src/fmcore/framework/_tracker/AimTracker.py index 
e5a503a..a7efb2a 100644 --- a/src/fmcore/framework/tracker/AimTracker.py +++ b/src/fmcore/framework/_tracker/AimTracker.py @@ -1,11 +1,16 @@ import logging from collections import deque from datetime import datetime -from typing import * +from typing import ( + Dict, + List, + Optional, +) -from fmcore.data import FileMetadata -from fmcore.framework.tracker.Tracker import Tracker -from fmcore.util import JupyterNotebook, String, get_default, get_fn_args, keep_keys, optional_dependency +from bears import FileMetadata +from bears.util import JupyterNotebook, String, get_default, get_fn_args, keep_keys, optional_dependency + +from fmcore.framework._tracker.Tracker import Tracker with optional_dependency("aim"): from aim import Repo, Run diff --git a/src/fmcore/framework/tracker/LogFileTracker.py b/src/fmcore/framework/_tracker/LogFileTracker.py similarity index 95% rename from src/fmcore/framework/tracker/LogFileTracker.py rename to src/fmcore/framework/_tracker/LogFileTracker.py index bc0f06f..550aab0 100644 --- a/src/fmcore/framework/tracker/LogFileTracker.py +++ b/src/fmcore/framework/_tracker/LogFileTracker.py @@ -3,14 +3,20 @@ import os.path import threading from datetime import datetime -from typing import * - +from typing import ( + Any, + Dict, + List, + Optional, + Tuple, +) + +from bears import FileMetadata +from bears.util import Alias, JupyterNotebook, String, get_fn_args, keep_keys, safe_validate_arguments, unset from pydantic import conint from fmcore.constants import Storage -from fmcore.data import FileMetadata -from fmcore.framework.tracker.Tracker import Tracker -from fmcore.util import Alias, JupyterNotebook, String, get_fn_args, keep_keys, safe_validate_arguments, unset +from fmcore.framework._tracker.Tracker import Tracker class LogTrackerFormatter(logging.Formatter): diff --git a/src/fmcore/framework/tracker/Tracker.py b/src/fmcore/framework/_tracker/Tracker.py similarity index 95% rename from src/fmcore/framework/tracker/Tracker.py rename to src/fmcore/framework/_tracker/Tracker.py index 727d09d..6ceb157 100644 --- a/src/fmcore/framework/tracker/Tracker.py +++ b/src/fmcore/framework/_tracker/Tracker.py @@ -1,18 +1,25 @@ import logging from abc import ABC, abstractmethod from functools import partial -from typing import * - -from pydantic import Extra, constr, root_validator +from typing import ( + Any, + Callable, + ClassVar, + Dict, + List, + Optional, + Set, + Tuple, + Type, + Union, +) -from fmcore.constants import _LIBRARY_NAME -from fmcore.data import FileMetadata -from fmcore.util import ( +from bears import FileMetadata +from bears.util import ( Alias, FileSystemUtil, Log, MutableParameters, - Parameters, Registry, String, as_list, @@ -22,6 +29,9 @@ random_sample, set_param_from_alias, ) +from pydantic import ConfigDict, constr, model_validator + +from fmcore import _LIBRARY_NAME Tracker = "Tracker" @@ -38,8 +48,9 @@ class Tracker(MutableParameters, Registry, ABC): ) _allow_subclass_override: ClassVar[bool] = True ## Allows replacement of subclass with same name. 
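The validator migrations in this file and the surrounding ones follow one pattern: v1 `@root_validator(pre=True)` becomes `@model_validator(mode="before")` plus an explicit `@classmethod`, and the former `@root_validator(pre=False)` variants additionally call `cls.set_default_param_values(params)`, which appears to compensate for a "before" validator running on the raw input before field defaults are applied. A minimal sketch of the converted shape, with a hypothetical `Settings` model and a simplified stand-in for `set_default_param_values`:

    from typing import Any, Dict
    from pydantic import BaseModel, model_validator

    class Settings(BaseModel):
        name: str = "default"
        verbosity: int = 1

        @model_validator(mode="before")
        @classmethod
        def _set_params(cls, params: Dict[str, Any]) -> Dict[str, Any]:
            # Stand-in for cls.set_default_param_values(params): a "before"
            # validator sees the raw input dict, so any defaults it depends on
            # must be filled in by hand before using them.
            params.setdefault("name", "default")
            params.setdefault("verbosity", 1)
            params["name"] = str(params["name"]).strip()
            return params

    Settings()            # name="default", verbosity=1
    Settings(name=" x ")  # name="x"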
- class Config(Parameters.Config): - extra = Extra.ignore + model_config = ConfigDict( + extra="ignore", + ) tracker_name: ClassVar[str] DEFAULT_PROJECTS_BASE_DIR: ClassVar[str] = FileSystemUtil.expand_dir(f"~/{_LIBRARY_NAME}/tracker/") @@ -79,7 +90,8 @@ def of( tracker.initialize(**kwargs) return tracker - @root_validator(pre=True) + @model_validator(mode="before") + @classmethod def tracker_params(cls, params: Dict): set_param_from_alias( params, diff --git a/src/fmcore/framework/tracker/__init__.py b/src/fmcore/framework/_tracker/__init__.py similarity index 57% rename from src/fmcore/framework/tracker/__init__.py rename to src/fmcore/framework/_tracker/__init__.py index 6ece1ff..8e38bd1 100644 --- a/src/fmcore/framework/tracker/__init__.py +++ b/src/fmcore/framework/_tracker/__init__.py @@ -1,14 +1,14 @@ import warnings -from fmcore.framework.tracker.Tracker import * -from fmcore.framework.tracker.AimTracker import * -from fmcore.framework.tracker.LogFileTracker import * +from fmcore.framework._tracker.Tracker import * +from fmcore.framework._tracker.AimTracker import * +from fmcore.framework._tracker.LogFileTracker import * DEFAULT_TRACKER: Optional[Tracker] = None try: - from fmcore.util.language import get_default - from fmcore.util.jupyter import JupyterNotebook - from fmcore.util.environment import EnvUtil + from bears.util.language import get_default + from bears.util.jupyter import JupyterNotebook + from bears.util.environment import EnvUtil if JupyterNotebook.is_notebook() and bool(get_default(EnvUtil.get_var('ENABLE_DEFAULT_TRACKER', False))): DEFAULT_TRACKER: Tracker = Tracker.default() diff --git a/src/fmcore/framework/trainer/AccelerateTrainer.py b/src/fmcore/framework/_trainer/AccelerateTrainer.py similarity index 98% rename from src/fmcore/framework/trainer/AccelerateTrainer.py rename to src/fmcore/framework/_trainer/AccelerateTrainer.py index 0272415..838bca9 100644 --- a/src/fmcore/framework/trainer/AccelerateTrainer.py +++ b/src/fmcore/framework/_trainer/AccelerateTrainer.py @@ -2,17 +2,18 @@ import os import warnings from functools import partial -from typing import * +from typing import ( + Any, + Callable, + Dict, + List, + Optional, + Type, +) import numpy as np - -from fmcore.data import FileMetadata -from fmcore.framework.algorithm import Algorithm -from fmcore.framework.metric import Metric, Metrics -from fmcore.framework.task_data import Dataset, Datasets, DataSplit -from fmcore.framework.tracker import Tracker -from fmcore.framework.trainer.Trainer import Trainer -from fmcore.util import ( +from bears import FileMetadata +from bears.util import ( FileSystemUtil, Log, String, @@ -23,6 +24,12 @@ safe_validate_arguments, ) +from fmcore.framework._algorithm import Algorithm +from fmcore.framework._metric import Metric, Metrics +from fmcore.framework._dataset import Dataset, Datasets, DataSplit +from fmcore.framework._tracker import Tracker +from fmcore.framework._trainer.Trainer import Trainer + with optional_dependency("accelerate", "torch"): from accelerate import Accelerator, notebook_launcher from accelerate.utils import DistributedDataParallelKwargs diff --git a/src/fmcore/framework/trainer/LocalTrainer.py b/src/fmcore/framework/_trainer/LocalTrainer.py similarity index 93% rename from src/fmcore/framework/trainer/LocalTrainer.py rename to src/fmcore/framework/_trainer/LocalTrainer.py index 0e21a52..ae0bcc8 100644 --- a/src/fmcore/framework/trainer/LocalTrainer.py +++ b/src/fmcore/framework/_trainer/LocalTrainer.py @@ -2,15 +2,20 @@ import os import warnings 
from functools import partial -from typing import * +from typing import ( + Callable, + Dict, + Optional, +) -from fmcore.data import FileMetadata -from fmcore.framework.algorithm import Algorithm -from fmcore.framework.metric import Metric, Metrics -from fmcore.framework.task_data import Dataset, Datasets, DataSplit -from fmcore.framework.tracker.Tracker import Tracker -from fmcore.framework.trainer.Trainer import Trainer -from fmcore.util import String, Timer, safe_validate_arguments +from bears import FileMetadata +from bears.util import String, Timer, safe_validate_arguments + +from fmcore.framework._algorithm import Algorithm +from fmcore.framework._metric import Metric, Metrics +from fmcore.framework._dataset import Dataset, Datasets, DataSplit +from fmcore.framework._tracker.Tracker import Tracker +from fmcore.framework._trainer.Trainer import Trainer class LocalTrainer(Trainer): diff --git a/src/fmcore/framework/trainer/RayTuneTrainer.py b/src/fmcore/framework/_trainer/RayTuneTrainer.py similarity index 99% rename from src/fmcore/framework/trainer/RayTuneTrainer.py rename to src/fmcore/framework/_trainer/RayTuneTrainer.py index aa13713..dfc8b02 100644 --- a/src/fmcore/framework/trainer/RayTuneTrainer.py +++ b/src/fmcore/framework/_trainer/RayTuneTrainer.py @@ -4,29 +4,12 @@ import random import warnings from functools import partial -from typing import * +from typing import Any, Callable, ClassVar, Dict, List, Literal, Optional, Set, Tuple, Type, Union import numpy as np import pandas as pd -from pydantic import Extra, confloat, conint, root_validator -from pydantic.typing import Literal - -from fmcore.constants import DataLayout -from fmcore.data import FileMetadata -from fmcore.framework.algorithm import Algorithm -from fmcore.framework.dl.torch import PyTorch -from fmcore.framework.metric import ( - CountingMetric, - Metric, - Metrics, - PercentageMetric, - TabularMetric, - metric_stats_str, -) -from fmcore.framework.task_data import Dataset, Datasets, DataSplit -from fmcore.framework.tracker.Tracker import Tracker -from fmcore.framework.trainer.Trainer import KFold, Trainer -from fmcore.util import ( +from bears import FileMetadata +from bears.util import ( FileSystemUtil, Log, MappedParameters, @@ -45,7 +28,23 @@ set_param_from_alias, type_str, ) -from fmcore.util.language._import import _IS_RAY_INSTALLED, _IS_TORCH_INSTALLED +from bears.util.language._import import _IS_RAY_INSTALLED, _IS_TORCH_INSTALLED +from pydantic import ConfigDict, confloat, conint, model_validator + +from fmcore.constants import DataLayout +from fmcore.framework._algorithm import Algorithm +from fmcore.framework.dl.torch import PyTorch +from fmcore.framework._metric import ( + CountingMetric, + Metric, + Metrics, + PercentageMetric, + TabularMetric, + metric_stats_str, +) +from fmcore.framework._dataset import Dataset, Datasets, DataSplit +from fmcore.framework._tracker.Tracker import Tracker +from fmcore.framework._trainer.Trainer import KFold, Trainer RayTuneTrainer = "RayTuneTrainer" _RAY_TRIAL_ID: str = "trial_id" @@ -145,7 +144,7 @@ def _ray_agg_final_model_metric_stats( if student_metrics[student_metric_name]["metric_class"] is None: student_metrics[student_metric_name]["metric_class"] = type(student_metric) else: - assert student_metrics[student_metric_name]["metric_class"] == type(student_metric) + assert student_metrics[student_metric_name]["metric_class"] is type(student_metric) student_metrics_stats: Dict[str, Dict[str, Union[int, float]]] = {} for student_metric_name, student_metric_d in 
student_metrics.items(): if issubclass(student_metric_d["metric_class"], (PercentageMetric, CountingMetric)): @@ -239,7 +238,7 @@ def _ray_agg_final_model_metric_stats( ## stopped. A paused trial can later be resumed from the most recent checkpoint. class HyperparameterSearchSpace(MappedParameters): - _mapping = append_to_keys( + mapping_dict: ClassVar[Dict[str, Callable]] = append_to_keys( prefix="tune.", d={ ## Ref: https://docs.ray.io/en/latest/tune/api_docs/search_space.html @@ -283,7 +282,7 @@ class HyperparameterSearchSpace(MappedParameters): ) class SearchAlgorithm(MappedParameters): - _mapping = { + mapping_dict: ClassVar[Dict[str, Type]] = { **{k: importer() for k, importer in SEARCH_ALG_IMPORT.items()}, **{"grid": SEARCH_ALG_IMPORT["variant_generator"]()}, } @@ -678,8 +677,9 @@ def cleanup(self): class RayTuneTrainer(Trainer): aliases = ["ray", "ray_tune"] - class Config(Trainer.Config): - extra = Extra.allow + model_config = ConfigDict( + extra="allow", + ) class RunConfig(Trainer.RunConfig): ray_init: RayInitConfig = {} @@ -706,7 +706,8 @@ class RunConfig(Trainer.RunConfig): tune_failure_retries: int = 0 tune_failure_retry_wait: int = 60 ## Seconds - @root_validator(pre=True) + @model_validator(mode="before") + @classmethod def ray_trainer_params(cls, params: Dict) -> Dict: ## Aliases for search_alg: set_param_from_alias(params, param="search_algorithm", alias=["search_alg"]) diff --git a/src/fmcore/framework/trainer/Trainer.py b/src/fmcore/framework/_trainer/Trainer.py similarity index 98% rename from src/fmcore/framework/trainer/Trainer.py rename to src/fmcore/framework/_trainer/Trainer.py index d845792..83b76e9 100644 --- a/src/fmcore/framework/trainer/Trainer.py +++ b/src/fmcore/framework/_trainer/Trainer.py @@ -3,19 +3,10 @@ import time from abc import ABC, abstractmethod from copy import deepcopy -from typing import * +from typing import Any, Callable, ClassVar, Dict, List, Literal, Optional, Set, Tuple, Type, TypeVar, Union -from pydantic import Extra, conint, root_validator -from pydantic.typing import Literal - -from fmcore.constants import DataSplit, Task -from fmcore.data import FileMetadata -from fmcore.framework.algorithm import Algorithm, TaskOrStr -from fmcore.framework.metric import CountingMetric, Metric, Metrics -from fmcore.framework.predictions import Predictions, save_predictions -from fmcore.framework.task_data import Dataset, Datasets, save_dataset -from fmcore.framework.tracker import DEFAULT_TRACKER_PARAMS, Tracker -from fmcore.util import ( +from bears import FileMetadata +from bears.util import ( Alias, FractionalBool, NeverFailJsonEncoder, @@ -36,8 +27,15 @@ set_param_from_alias, type_str, ) +from pydantic import ConfigDict, conint, model_validator + +from fmcore.constants import DataSplit, Task +from fmcore.framework._algorithm import Algorithm, TaskOrStr +from fmcore.framework._metric import CountingMetric, Metric, Metrics +from fmcore.framework._predictions import Predictions, save_predictions +from fmcore.framework._dataset import Dataset, Datasets, save_dataset +from fmcore.framework._tracker import DEFAULT_TRACKER_PARAMS, Tracker -Trainer = "Trainer" TrainerSubclass = TypeVar("TrainerSubclass", bound="Trainer") @@ -138,12 +136,14 @@ class Trainer(Parameters, Registry, ABC): _allow_subclass_override: ClassVar[bool] = True ## Allows replacement of subclass with same name. 
trainer_metrics: ClassVar[Tuple[str, ...]] = ("row_count",) - class Config(Parameters.Config): - extra = Extra.ignore + model_config = ConfigDict( + extra="ignore", + ) class RunConfig(Parameters): - class Config(Parameters.Config): - extra = Extra.allow + model_config = ConfigDict( + extra="allow", + ) AlgorithmClass: Union[Type[Algorithm], Algorithm, str] task: TaskOrStr = None @@ -165,7 +165,8 @@ class Config(Parameters.Config): ## Logging verbosity. 0 = zero logging, 1 = Basic logging, 2 = verbose logging. verbosity: conint(ge=0) = 1 - @root_validator(pre=True) + @model_validator(mode="before") + @classmethod def _set_trainer_params(cls, params: Dict): set_param_from_alias(params, param="AlgorithmClass", alias=["algorithm"]) set_param_from_alias(params, param="eval_batch_size", alias=["predict_batch_size"]) diff --git a/src/fmcore/framework/_trainer/__init__.py b/src/fmcore/framework/_trainer/__init__.py new file mode 100644 index 0000000..c68566f --- /dev/null +++ b/src/fmcore/framework/_trainer/__init__.py @@ -0,0 +1,4 @@ +from fmcore.framework._trainer.Trainer import * +from fmcore.framework._trainer.LocalTrainer import * +from fmcore.framework._trainer.AccelerateTrainer import * +from fmcore.framework._trainer.RayTuneTrainer import * diff --git a/src/fmcore/framework/visualize.py b/src/fmcore/framework/_visualize.py similarity index 97% rename from src/fmcore/framework/visualize.py rename to src/fmcore/framework/_visualize.py index 548903e..9787795 100644 --- a/src/fmcore/framework/visualize.py +++ b/src/fmcore/framework/_visualize.py @@ -1,13 +1,17 @@ from abc import ABC, abstractmethod -from functools import singledispatchmethod -from typing import * - -from pydantic import Extra, constr, root_validator +from typing import ( + Any, + Callable, + ClassVar, + Dict, + List, + Optional, + Set, + Type, + Union, +) -from fmcore.constants import VISUALIZATION_BACKEND_DEPENDENCIES, VisualizationBackend -from fmcore.framework.predictions import Predictions -from fmcore.framework.task_data import Dataset -from fmcore.util import ( +from bears.util import ( Parameters, Registry, String, @@ -17,6 +21,11 @@ optional_dependency, safe_validate_arguments, ) +from pydantic import ConfigDict, constr, model_validator + +from fmcore.constants import VISUALIZATION_BACKEND_DEPENDENCIES, VisualizationBackend +from fmcore.framework._predictions import Predictions +from fmcore.framework._dataset import Dataset class Visualization(Parameters, Registry, ABC): @@ -115,9 +124,6 @@ def available_backends(cls, name: str) -> Set[VisualizationBackend]: available_backends.add(Subclass.backend) return available_backends - class Config(Parameters.Config): - keep_untouched = (singledispatchmethod,) - class Params(Parameters): """ BaseModel for parameters. Expected to be overridden by subclasses. 
@@ -143,13 +149,15 @@ class Params(Visualization.Params): yaxis_position: str = "left" decimals: int = 3 - class Config(Parameters.Config): - extra = Extra.ignore + model_config = ConfigDict( + extra="ignore", + ) name: constr(min_length=1) params: Params = {} - @root_validator(pre=True) + @model_validator(mode="before") + @classmethod def convert_params(cls, params: Dict): params["params"] = super(Visualization, cls)._convert_params(cls.Params, params.get("params")) params["name"] = cls.class_name diff --git a/src/fmcore/framework/chain/__init__.py b/src/fmcore/framework/chain/__init__.py deleted file mode 100644 index 66c746f..0000000 --- a/src/fmcore/framework/chain/__init__.py +++ /dev/null @@ -1 +0,0 @@ -from fmcore.framework.chain.Chain import * \ No newline at end of file diff --git a/src/fmcore/framework/dl/__init__.py b/src/fmcore/framework/dl/__init__.py index e69de29..c779ed0 100644 --- a/src/fmcore/framework/dl/__init__.py +++ b/src/fmcore/framework/dl/__init__.py @@ -0,0 +1 @@ +from fmcore.framework.dl.torch import * \ No newline at end of file diff --git a/src/fmcore/framework/dl/torch/__init__.py b/src/fmcore/framework/dl/torch/__init__.py index 68e2a97..d1cc7b0 100644 --- a/src/fmcore/framework/dl/torch/__init__.py +++ b/src/fmcore/framework/dl/torch/__init__.py @@ -1,3 +1,3 @@ -from fmcore.framework.dl.torch.torch_base import * -from fmcore.framework.dl.torch.torch_tasks import * -from fmcore.framework.dl.torch.PyTorchTaskDataDataset import * +from fmcore.framework.dl.torch._torch_base import * +from fmcore.framework.dl.torch._torch_tasks import * +from fmcore.framework.dl.torch._torch_dataset import * diff --git a/src/fmcore/framework/dl/torch/torch_base.py b/src/fmcore/framework/dl/torch/_torch_base.py similarity index 96% rename from src/fmcore/framework/dl/torch/torch_base.py rename to src/fmcore/framework/dl/torch/_torch_base.py index ef20e9f..3985116 100644 --- a/src/fmcore/framework/dl/torch/torch_base.py +++ b/src/fmcore/framework/dl/torch/_torch_base.py @@ -1,16 +1,27 @@ import gc from abc import ABC, abstractmethod from functools import partial -from typing import * - -from pydantic import conint, root_validator +from typing import ( + Any, + ClassVar, + Dict, + Generator, + List, + Optional, + Set, + Tuple, + Type, + Union, +) + +from bears.util import MappedParameters, get_default, optional_dependency, safe_validate_arguments +from bears.util.language._import import _IS_TORCH_INSTALLED +from pydantic import conint, model_validator from fmcore.constants import DataLayout, DataPosition, MLTypeSchema -from fmcore.framework.algorithm import Algorithm -from fmcore.framework.predictions import Predictions -from fmcore.framework.task_data import Dataset -from fmcore.util import MappedParameters, get_default, optional_dependency, safe_validate_arguments -from fmcore.util.language._import import _IS_TORCH_INSTALLED +from fmcore.framework._algorithm import Algorithm +from fmcore.framework._predictions import Predictions +from fmcore.framework._dataset import Dataset PyTorch = "PyTorch" @@ -125,14 +136,14 @@ def validate_data_on_device( return True class Optimizer(MappedParameters): - _mapping: ClassVar[Dict[str, Type[TorchOptimizer]]] = { + mapping_dict: ClassVar[Dict[str, Type[TorchOptimizer]]] = { name: val for name, val in torch.optim.__dict__.items() if isinstance(val, type) and issubclass(val, TorchOptimizer) } class Loss(MappedParameters): - _mapping: ClassVar[Dict[str, Type[TorchLoss]]] = { + mapping_dict: ClassVar[Dict[str, Type[TorchLoss]]] = { "KLDivLoss": 
torch.nn.KLDivLoss, "NLLLoss": torch.nn.NLLLoss, "SmoothL1Loss": torch.nn.SmoothL1Loss, @@ -197,7 +208,7 @@ def _get_linear_schedule_with_warmup_lr_lambda( ) class LRScheduler(MappedParameters): - _mapping: ClassVar[Dict[str, Type[TorchLRScheduler]]] = { + mapping_dict: ClassVar[Dict[str, Type[TorchLRScheduler]]] = { "LinearLR": torch.optim.lr_scheduler.LinearLR, "ConstantLR": torch.optim.lr_scheduler.ConstantLR, "LambdaLR": torch.optim.lr_scheduler.LambdaLR, @@ -247,8 +258,10 @@ class Hyperparameters(Algorithm.Hyperparameters): } gradient_accumulation_steps: conint(ge=1) = 1 - @root_validator(pre=False) ## Run this post all values set by subclasses. + @model_validator(mode="before") + @classmethod ## Run this post all values set by subclasses. def convert_hyperparams(cls, hyperparams: Dict) -> Dict: + cls.set_default_param_values(hyperparams) if hyperparams.get("optimizer", None) is not None: hyperparams["optimizer"] = Optimizer.of(hyperparams["optimizer"]) diff --git a/src/fmcore/framework/dl/torch/PyTorchTaskDataDataset.py b/src/fmcore/framework/dl/torch/_torch_dataset.py similarity index 97% rename from src/fmcore/framework/dl/torch/PyTorchTaskDataDataset.py rename to src/fmcore/framework/dl/torch/_torch_dataset.py index f975be6..09afae5 100644 --- a/src/fmcore/framework/dl/torch/PyTorchTaskDataDataset.py +++ b/src/fmcore/framework/dl/torch/_torch_dataset.py @@ -1,14 +1,21 @@ -from typing import * +from typing import ( + Any, + Dict, + Generator, +) -from fmcore.framework.mixins import InputOutputDataMixin -from fmcore.util.language._import import _IS_TORCH_INSTALLED +from bears.util.language._import import _IS_TORCH_INSTALLED + +from fmcore.framework._task_mixins import InputOutputDataMixin + +PyTorchTaskDataset = "PyTorchTaskDataset" if _IS_TORCH_INSTALLED: import torch from torch.utils.data import DataLoader as TorchDataLoader from torch.utils.data import IterableDataset as TorchIterableDataset - class PyTorchTaskDataDataset(TorchIterableDataset): + class PyTorchTaskDataset(TorchIterableDataset): """ PyTorch has two kinds of datasets: map-style (torch.utils.data.Dataset) and iterable-style (torch.utils.data.IterableDataset). 
diff --git a/src/fmcore/framework/dl/torch/torch_tasks.py b/src/fmcore/framework/dl/torch/_torch_tasks.py similarity index 94% rename from src/fmcore/framework/dl/torch/torch_tasks.py rename to src/fmcore/framework/dl/torch/_torch_tasks.py index 754afd7..7287f6e 100644 --- a/src/fmcore/framework/dl/torch/torch_tasks.py +++ b/src/fmcore/framework/dl/torch/_torch_tasks.py @@ -1,12 +1,24 @@ from abc import ABC, abstractmethod -from typing import * +from typing import ( + Any, + Callable, + ClassVar, + Dict, + List, + Literal, + Optional, + Tuple, + Union, +) import numpy as np -from pydantic.typing import Literal +from bears import FileMetadata +from bears.util import FileSystemUtil +from bears.util.language._import import _IS_TORCH_INSTALLED from fmcore.constants import Storage, Task -from fmcore.data import FileMetadata -from fmcore.framework.task import ( +from fmcore.framework._dataset import Dataset +from fmcore.framework._task import ( ClassificationData, Classifier, Embedder, @@ -15,9 +27,6 @@ MultiLabelClassifier, Regressor, ) -from fmcore.framework.task_data import Dataset -from fmcore.util import FileSystemUtil -from fmcore.util.language._import import _IS_TORCH_INSTALLED PyTorchBaseModel = "PyTorchBaseModel" if _IS_TORCH_INSTALLED: @@ -25,8 +34,9 @@ from torch import Tensor from torch.nn import Module as TorchModule from torch.nn.functional import softmax + from torch.optim import Optimizer as TorchOptimizer - from fmcore.framework.dl.torch.torch_base import Loss, PyTorch, is_accelerator + from fmcore.framework.dl.torch._torch_base import Loss, PyTorch, is_accelerator class PyTorchBaseModel(Embedder, PyTorch, ABC): @classmethod @@ -198,7 +208,7 @@ def prepare_target( **kwargs, ) -> Tensor: target = batch.ground_truths().torch().squeeze() - if len(target.shape) == 0: ## We accidentally converted it into a Scalar. + if len(target.shape) == 0: ## We accidentally converted it into a scalar. target: Tensor = target.unsqueeze(0) return target @@ -217,8 +227,8 @@ class PyTorchClassifier(PyTorchClassifierMixin, PyTorchTaskHead): class Hyperparameters(PyTorchTaskHead.Hyperparameters): dropout: float = 0.1 - loss: Union[Loss, Dict, str] = "CrossEntropyLoss" - optimizer = dict( + loss: Loss = "CrossEntropyLoss" + optimizer: TorchOptimizer = dict( name="AdamW", lr=5e-5, weight_decay=1e-7, @@ -286,7 +296,7 @@ def prepare_predictions(self, output: Tensor, **kwargs) -> Dict[str, np.ndarray] class PyTorchMultiLabelClassifier(PyTorchMultiLabelClassifierMixin, PyTorchClassifier): class Hyperparameters(PyTorchClassifier.Hyperparameters): - loss: Union[Loss, Dict, str] = "BCEWithLogitsLoss" + loss: Loss = "BCEWithLogitsLoss" class PyTorchRegressorMixin(Regressor, PyTorch, ABC): def prepare_target( @@ -295,7 +305,7 @@ def prepare_target( **kwargs, ) -> Tensor: target: Tensor = batch.ground_truths().torch().squeeze() - if len(target.shape) == 0: ## We accidentally converted it into a Scalar. + if len(target.shape) == 0: ## We accidentally converted it into a scalar. 
target: Tensor = target.unsqueeze(0) return target @@ -316,8 +326,8 @@ class PyTorchRegressor(PyTorchRegressorMixin, PyTorchTaskHead): class Hyperparameters(PyTorchTaskHead.Hyperparameters): dropout: float = 0.1 - loss = "MSELoss" - optimizer = dict( + loss: Loss = "MSELoss" + optimizer: TorchOptimizer = dict( name="AdamW", lr=5e-5, weight_decay=1e-7, diff --git a/src/fmcore/framework/evaluator/__init__.py b/src/fmcore/framework/evaluator/__init__.py deleted file mode 100644 index 33586a3..0000000 --- a/src/fmcore/framework/evaluator/__init__.py +++ /dev/null @@ -1,4 +0,0 @@ -from fmcore.framework.evaluator.Evaluator import * -from fmcore.framework.evaluator.LocalEvaluator import * -from fmcore.framework.evaluator.AccelerateEvaluator import * -from fmcore.framework.evaluator.RayEvaluator import * \ No newline at end of file diff --git a/src/fmcore/framework/task/__init__.py b/src/fmcore/framework/task/__init__.py deleted file mode 100644 index 6d35ab3..0000000 --- a/src/fmcore/framework/task/__init__.py +++ /dev/null @@ -1,7 +0,0 @@ -from fmcore.framework.task.regression import * -from fmcore.framework.task.classification import * -from fmcore.framework.task.embedding import * -from fmcore.framework.task.text_generation import * -from fmcore.framework.task.retrieval import * -from fmcore.framework.task.dense_retrieval import * -from fmcore.framework.task.sparse_retrieval import * diff --git a/src/fmcore/framework/trainer/__init__.py b/src/fmcore/framework/trainer/__init__.py deleted file mode 100644 index 677b759..0000000 --- a/src/fmcore/framework/trainer/__init__.py +++ /dev/null @@ -1,4 +0,0 @@ -from fmcore.framework.trainer.Trainer import * -from fmcore.framework.trainer.LocalTrainer import * -from fmcore.framework.trainer.AccelerateTrainer import * -from fmcore.framework.trainer.RayTuneTrainer import * diff --git a/src/fmcore/metric/__init__.py b/src/fmcore/metric/__init__.py index 24383a2..c86950c 100644 --- a/src/fmcore/metric/__init__.py +++ b/src/fmcore/metric/__init__.py @@ -1,7 +1,7 @@ from importlib import import_module import os from pathlib import Path -from fmcore.util.language import String +from bears.util.language import String __THIS_FILE__ = __file__ ## Needed when calling reload() from outside this file. 
diff --git a/src/fmcore/metric/classification_metrics.py b/src/fmcore/metric/classification_metrics.py index e9b6e27..2f40561 100644 --- a/src/fmcore/metric/classification_metrics.py +++ b/src/fmcore/metric/classification_metrics.py @@ -1,11 +1,20 @@ from abc import ABC, abstractmethod from collections import defaultdict -from typing import * +from typing import ( + Any, + ClassVar, + Dict, + List, + Literal, + Optional, + Set, + Tuple, +) import numpy as np import pandas as pd +from bears.util import argmax, as_tuple, check_isinstance, dict_key_with_best_value, type_str from pydantic import confloat -from pydantic.typing import Literal from fmcore.constants import AggregationStrategy, DataSplit, Task from fmcore.framework import ( @@ -18,8 +27,7 @@ TabularMetric, TopKClassificationPredictions, ) -from fmcore.framework.trainer.RayTuneTrainer import _RAY_TRAINING_ITERATION, _RAY_TRIAL_ID -from fmcore.util import argmax, as_tuple, check_isinstance, dict_key_with_best_value, type_str +from fmcore.framework._trainer.RayTuneTrainer import _RAY_TRAINING_ITERATION, _RAY_TRIAL_ID def _check_clf_preds(data: Predictions): diff --git a/src/fmcore/metric/regression_metrics.py b/src/fmcore/metric/regression_metrics.py index 4ecfe69..42b4b8e 100644 --- a/src/fmcore/metric/regression_metrics.py +++ b/src/fmcore/metric/regression_metrics.py @@ -1,7 +1,6 @@ import math from decimal import Decimal from fractions import Fraction -from typing import * from pydantic import confloat diff --git a/src/fmcore/metric/text_generation_metrics.py b/src/fmcore/metric/text_generation_metrics.py index 59ad921..3b66d4b 100644 --- a/src/fmcore/metric/text_generation_metrics.py +++ b/src/fmcore/metric/text_generation_metrics.py @@ -2,40 +2,12 @@ import math import random from abc import ABC -from typing import * +from collections import Counter +from typing import Any, Callable, ClassVar, Dict, List, Literal, Optional, Tuple, Union import numpy as np import pandas as pd -from pydantic import Extra, confloat, conint, root_validator -from pydantic.typing import Literal - -from fmcore.constants import AggregationStrategy, DataLayout, DataSplit, MLType, Parallelize, Task, TaskOrStr -from fmcore.framework import ( - GENERATED_TEXTS_COL, - PROMPT_TEMPLATE_INDEX_COL_PREFIX, - TEXT_PROMPT_COL, - ClassificationData, - Dataset, - Datasets, - Evaluator, - FileMetadata, - Metric, - Metrics, - NextTokens, - PercentageMetric, - Predictions, - Prompts, - RayTuneTrainer, - RayTuneTrainerFinalModelsError, - TabularMetric, - TextGenerations, - TextGenerationsPredictionsBase, - Trainer, -) -from fmcore.framework.dl.torch import clear_device_cache -from fmcore.framework.evaluator.RayEvaluator import LoadBalancingStrategy -from fmcore.framework.trainer.RayTuneTrainer import _ray_agg_final_model_metric_stats -from fmcore.util import ( +from bears.util import ( Alias, EnvUtil, String, @@ -62,6 +34,34 @@ set_param_from_alias, type_str, ) +from pydantic import ConfigDict, confloat, conint, model_validator + +from fmcore.constants import AggregationStrategy, DataLayout, DataSplit, MLType, Parallelize, Task, TaskOrStr +from fmcore.framework import ( + GENERATED_TEXTS_COL, + PROMPT_TEMPLATE_INDEX_COL_PREFIX, + TEXT_PROMPT_COL, + ClassificationData, + Dataset, + Datasets, + Evaluator, + FileMetadata, + Metric, + Metrics, + NextTokens, + PercentageMetric, + Predictions, + Prompts, + RayTuneTrainer, + RayTuneTrainerFinalModelsError, + TabularMetric, + TextGenerations, + TextGenerationsPredictionsBase, + Trainer, +) +from fmcore.framework.dl.torch import 
clear_device_cache +from fmcore.framework._evaluator.RayEvaluator import LoadBalancingStrategy +from fmcore.framework._trainer.RayTuneTrainer import _ray_agg_final_model_metric_stats class TextLength(TabularMetric): @@ -282,8 +282,9 @@ class RagasFaithfulness(RagasMetricBase): aliases = ["faithfulness"] class Params(RagasMetricBase.Params): - class Config(PercentageMetric.Params.Config): - extra = Extra.allow + model_config = ConfigDict( + extra="allow", + ) statement_extraction_prompt: str = ( """ @@ -319,8 +320,10 @@ class Config(PercentageMetric.Params.Config): "As per my knowledge", ] - @root_validator(pre=False) + @model_validator(mode="before") + @classmethod def _set_faithfulness_params(cls, params: Dict) -> Dict: + cls.set_default_param_values(params) if String.punct_normalize(params["algorithm"]) in { String.punct_normalize("bedrock") } and String.punct_normalize("anthropic.claude") in String.punct_normalize( @@ -733,7 +736,7 @@ class RagasContextRelevance(RagasMetricBase): """ From the paper: https://arxiv.org/abs/2309.15217 "The context c(q) is considered relevant to the extent that it exclusively contains information that is - needed to answer the question. In particular, this metric aims to penalise the inclusion of redundant + needed to answer the question. In particular, this metric aims to penalize the inclusion of redundant information. [Step 1] To estimate context relevance, given a question q and its context c(q), the LLM extracts a @@ -752,8 +755,9 @@ class RagasContextRelevance(RagasMetricBase): aliases = ["context_relevance"] class Params(RagasMetricBase.Params): - class Config(PercentageMetric.Params.Config): - extra = Extra.allow + model_config = ConfigDict( + extra="allow", + ) relevant_context_extraction_prompt: str = ( """ @@ -771,8 +775,10 @@ class Config(PercentageMetric.Params.Config): ("Relevant Sentences:", "Assistant:"), ] - @root_validator(pre=False) + @model_validator(mode="before") + @classmethod def _set_context_relevance_params(cls, params: Dict) -> Dict: + cls.set_default_param_values(params) if String.punct_normalize(params["algorithm"]) in { String.punct_normalize("bedrock") } and String.punct_normalize("anthropic.claude") in String.punct_normalize( @@ -1081,8 +1087,9 @@ def _relevant_context_parser(cls, relevant_context: str) -> Optional[Union[List[ class Mauve(PercentageMetric): class Params(PercentageMetric.Params): - class Config(PercentageMetric.Params.Config): - extra = Extra.allow + model_config = ConfigDict( + extra="allow", + ) references_col: str generations_col: str = GENERATED_TEXTS_COL @@ -1151,8 +1158,9 @@ def calc_mauve( class EntityCount(TabularMetric): class Params(TabularMetric.Params): - class Config(TabularMetric.Params.Config): - extra = Extra.allow + model_config = ConfigDict( + extra="allow", + ) num_cpus: int = 8 num_gpus: int = 0 @@ -1160,13 +1168,13 @@ class Config(TabularMetric.Params.Config): batch_size: int = 50 generations_col: str = GENERATED_TEXTS_COL - ## Overvall counts of entities: + ## Overall counts of entities: _entity_counts: Counter = Counter() - ## Overvall counts of entity-labels: + ## Overall counts of entity-labels: _entity_label_counts: Counter = Counter() ## "Apple" can be the company or the fruit; this counts the spread of labels for each identified entity: _entitywise_label_counts: Dict[str, Counter] = {} - ## Number of entites per row + ## Number of entities per row _row_num_entities: Counter = Counter() _num_rows: int = 0 _entity_count_df: Optional[pd.DataFrame] = None @@ -1280,8 +1288,9 @@ class 
SelfBLEU(TabularMetric): aliases = ["Self-BLEU"] class Params(TabularMetric.Params): - class Config(TabularMetric.Params.Config): - extra = Extra.allow + model_config = ConfigDict( + extra="allow", + ) num_cpus: int = 8 num_gpus: int = 0 @@ -1367,10 +1376,10 @@ def spacy_tokenize_docs( nlp: Language = spacy.load(spacy_tokenization_model, disable=["parser", "tagger", "ner"]) tokenized_docs: List[List[str]] = [] for sent_doc in nlp.pipe(docs, n_process=max_workers, batch_size=batch_size): - toks: List[str] = [] + tokens: List[str] = [] for tok in sent_doc: - toks.append(tok.text) - tokenized_docs.append(toks) + tokens.append(tok.text) + tokenized_docs.append(tokens) return tokenized_docs @classmethod @@ -1529,7 +1538,8 @@ class Params(Metric.Params): max_retries: int = 1 verbosity: int = 0 - @root_validator(pre=True) + @model_validator(mode="before") + @classmethod def _set_metric_params(cls, params: Dict) -> Dict: Alias.set_metrics(params) if params.get("metrics") is not None: @@ -1639,7 +1649,8 @@ class Params(Metric.Params): verbosity: int = 0 save_to: Optional[FileMetadata] = None - @root_validator(pre=True) + @model_validator(mode="before") + @classmethod def _set_metric_params(cls, params: Dict) -> Dict: Alias.set_metrics(params) params["metrics"]: Metrics = Metrics.of(params["metrics"]) @@ -1853,7 +1864,7 @@ def metric_stats(self, data_split: DataSplit) -> Dict[str, Dict[str, Union[int, from sentence_transformers import SentenceTransformer from sklearn.metrics.pairwise import cosine_similarity - from fmcore.framework.task.dense_retrieval import _normalize_l2 + from fmcore.framework._task.dense_retrieval import _normalize_l2 class LabelwiseCosineSimilarity(TabularMetric): class Params(TabularMetric.Params): @@ -2005,22 +2016,22 @@ def to_labelwise_cosine_sims( agg: AggregationStrategy = AggregationStrategy(agg) labelspace: List[str] = sorted(list(pairwise_cosine_sims[index_col].apply(get_lb).unique())) assert len(labelspace) > 1 - labelsiwise_cosine_sims = {} + labelwise_cosine_sims = {} for idx_i, cosine_sims in zip( pairwise_cosine_sims[index_col], pairwise_cosine_sims["cosine_sims"] ): - labelsiwise_cosine_sims.setdefault(get_lb(idx_i), {}) + labelwise_cosine_sims.setdefault(get_lb(idx_i), {}) for idx_j, cosine_sim in cosine_sims.items(): - labelsiwise_cosine_sims[get_lb(idx_i)].setdefault(get_lb(idx_j), []) - labelsiwise_cosine_sims[get_lb(idx_i)][get_lb(idx_j)].append(round(float(cosine_sim), 6)) - labelsiwise_cosine_sims_agg: Dict[str, Dict[str, Union[float, List[float]]]] = {} - for lb_i, d in labelsiwise_cosine_sims.items(): + labelwise_cosine_sims[get_lb(idx_i)].setdefault(get_lb(idx_j), []) + labelwise_cosine_sims[get_lb(idx_i)][get_lb(idx_j)].append(round(float(cosine_sim), 6)) + labelwise_cosine_sims_agg: Dict[str, Dict[str, Union[float, List[float]]]] = {} + for lb_i, d in labelwise_cosine_sims.items(): assert isinstance(lb_i, str) - labelsiwise_cosine_sims_agg.setdefault(lb_i, {}) - for lb_j, cosine_sims_list in labelsiwise_cosine_sims[lb_i].items(): + labelwise_cosine_sims_agg.setdefault(lb_i, {}) + for lb_j, cosine_sims_list in labelwise_cosine_sims[lb_i].items(): assert isinstance(lb_j, str) - labelsiwise_cosine_sims_agg[lb_i][lb_j] = self._aggregate(cosine_sims_list, agg=agg) - return pd.DataFrame(labelsiwise_cosine_sims_agg)[labelspace] + labelwise_cosine_sims_agg[lb_i][lb_j] = self._aggregate(cosine_sims_list, agg=agg) + return pd.DataFrame(labelwise_cosine_sims_agg)[labelspace] @classmethod def _aggregate( diff --git a/src/fmcore/util/__init__.py 
b/src/fmcore/util/__init__.py deleted file mode 100644 index 1f0c1a0..0000000 --- a/src/fmcore/util/__init__.py +++ /dev/null @@ -1,10 +0,0 @@ -from fmcore.util.language import * -from fmcore.util.language import * -from fmcore.util.jupyter import * -from fmcore.util.concurrency import * -from fmcore.util.profiling import * -from fmcore.util.environment import * -from fmcore.util.filesystem import * -from fmcore.util.logging import * -from fmcore.util.schema import * -from fmcore.util.notify import * diff --git a/src/fmcore/util/aws/__init__.py b/src/fmcore/util/aws/__init__.py deleted file mode 100644 index fb5657e..0000000 --- a/src/fmcore/util/aws/__init__.py +++ /dev/null @@ -1 +0,0 @@ -from fmcore.util.aws.s3 import * \ No newline at end of file diff --git a/src/fmcore/util/aws/s3.py b/src/fmcore/util/aws/s3.py deleted file mode 100644 index 4c89a6d..0000000 --- a/src/fmcore/util/aws/s3.py +++ /dev/null @@ -1,559 +0,0 @@ -import fnmatch -import io -import math -import os -import pickle -import random -import time -from typing import * -from urllib.parse import ParseResult, urlparse - -import boto3 - -from fmcore.constants import _LIBRARY_NAME -from fmcore.util import FileSystemUtil, Log, String, Timer, Utility, shuffle_items -from fmcore.util.language import any_are_none, as_list, remove_values - - -class S3Util(Utility): - S3_BUCKET = "Bucket" - OBJECT_KEY = "Key" - ACL = "ACL" - S3_PATH_REGEX = String.CARET + String.S3_PREFIX + "(\S+?)" + String.SLASH + "(\S+)" + String.DOLLAR - ## Permissions: - S3_BUCKET_GET_OBJ_PERMISSION = "s3:GetObject" - S3_BUCKET_LIST_PERMISSION = "s3:ListBucket" - S3_BUCKET_PUT_OBJ_PERMISSION = "s3:PutObject" - S3_BUCKET_DELETE_OBJ_PERMISSION = "s3:DeleteObject" - - S3_BUCKET_OWNER_FULL_CONTROL_ACL = "bucket-owner-full-control" - - @classmethod - def s3_path_exploder(cls, s3_path: str) -> Tuple[str, str]: - s3_path = String.assert_not_empty_and_strip(s3_path) - s3_parsed_result: ParseResult = urlparse(s3_path) - s3_bucket, object_key = None, None - if String.is_not_empty(s3_parsed_result.netloc) and String.SPACE not in s3_parsed_result.netloc: - s3_bucket = s3_parsed_result.netloc - if String.is_not_empty(s3_parsed_result.path) and s3_bucket is not None: - object_key = String.remove_prefix(s3_parsed_result.path, String.SLASH) - return s3_bucket, object_key - - @classmethod - def s3_path_exploder_dict(cls, s3_path: str) -> Dict[str, str]: - s3_bucket, object_key = cls.s3_path_exploder(s3_path) - return {cls.S3_BUCKET: s3_bucket, cls.OBJECT_KEY: object_key} - - @classmethod - def is_valid_s3_path(cls, s3_path: str) -> bool: - s3_path = String.assert_not_empty_and_strip(s3_path) - s3_bucket, object_key = cls.s3_path_exploder(s3_path) - return String.is_not_empty(s3_bucket) and String.is_not_empty(object_key) - - @classmethod - def is_path_valid_s3_dir(cls, s3_path: str) -> bool: - return cls.is_valid_s3_path(s3_path) and s3_path.endswith(String.SLASH) - - @classmethod - def get_s3_dir(cls, s3_path: str) -> str: - if cls.is_path_valid_s3_dir(s3_path): - return s3_path - return String.SLASH.join(s3_path.split(String.SLASH)[:-1]) + String.SLASH - - @classmethod - def check_bucket_permission(cls, s3_path: str, action_names: Union[str, List[str]]) -> bool: - s3_bucket, _ = cls.s3_path_exploder(s3_path) - if isinstance(action_names, str): - action_names: List[str] = [action_names] - ## Ref: https://stackoverflow.com/a/47058571 - iam = boto3.client("iam") - sts = boto3.client("sts") - # Get the arn represented by the currently configured credentials - arn = 
sts.get_caller_identity()["Arn"] - # Create an arn representing the objects in a bucket - bucket_objects_arn = f"arn:aws:s3:::{s3_bucket}/*" - # Run the policy simulation for the basic s3 operations - results = iam.simulate_principal_policy( - PolicySourceArn=arn, ResourceArns=[bucket_objects_arn], ActionNames=action_names - ) - for policy_result in results["EvaluationResults"]: - if policy_result["EvalDecision"] != "allowed": - return False - return True - - @classmethod - def can_read_bucket(cls, s3_path: str) -> bool: - return cls.check_bucket_permission( - s3_path, [cls.S3_BUCKET_GET_OBJ_PERMISSION, cls.S3_BUCKET_LIST_PERMISSION] - ) - - @classmethod - def can_write_bucket(cls, s3_path: str) -> bool: - return cls.check_bucket_permission(s3_path, cls.S3_BUCKET_PUT_OBJ_PERMISSION) - - @classmethod - def can_delete_from_bucket(cls, s3_path: str) -> bool: - return cls.check_bucket_permission(s3_path, cls.S3_BUCKET_DELETE_OBJ_PERMISSION) - - @classmethod - def s3_object_exists(cls, s3_path: str): - return cls.get_s3_object_details(s3_path, log_error=False) is not None - - @classmethod - def s3_object_does_not_exist(cls, s3_path: str): - String.assert_not_empty(s3_path) - return not cls.s3_object_exists(s3_path) - - @classmethod - def get_s3_object_details(cls, s3_path: str, log_error: bool = True): - s3_bucket, object_key = cls.s3_path_exploder(s3_path) - s3 = boto3.client("s3") - try: - return s3.head_object(Bucket=s3_bucket, Key=object_key) - except Exception as e: - if log_error: - if e.response.get("Error").get("Code") == 404: - Log.error("Bucket %s does not contain key %s" % (s3_bucket, object_key)) - else: - Log.error(str(e)) - return None - - @classmethod - def get_s3_object_etag(cls, s3_path: str): - s3_resp = cls.get_s3_object_details(s3_path) - if s3_resp is None: - return None - etag = s3_resp["ETag"].strip('"') - ## To handle etags of multi-part uploads. 
Ref: https://stackoverflow.com/q/6591047/4900327 - etag = etag.split("-")[0] - return etag - - @classmethod - def get_s3_object_size( - cls, - s3_path: Union[List[str], str], - unit: Optional[str] = None, - decimals: int = 3, - ignore_missing: bool = True, - ) -> Optional[Union[float, str]]: - s3_obj_details: List = [cls.get_s3_object_details(s3_fpath) for s3_fpath in as_list(s3_path)] - if any_are_none(*s3_obj_details) and not ignore_missing: - return None - size_in_bytes: int = int( - sum([s3_resp["ContentLength"] for s3_resp in s3_obj_details if s3_resp is not None]) - ) - if unit is not None: - return String.convert_size_from_bytes(size_in_bytes, unit=unit, decimals=decimals) - return String.readable_bytes(size_in_bytes, decimals=decimals) - - @classmethod - def list_recursive_objects_in_dir(cls, *args, **kwargs) -> List[str]: - return cls.list(*args, **kwargs) - - @classmethod - def list( - cls, - s3_path: str, - *, - file_glob: str = String.DOUBLE_ASTERISK, - ignored_files: Union[str, List[str]] = String.FILES_TO_IGNORE, - **kwargs, - ) -> List[str]: - ignored_files: List[str] = as_list(ignored_files) - s3 = boto3.resource("s3") - s3_bucket, object_key = cls.s3_path_exploder(s3_path) - s3_bucket_resource = s3.Bucket(s3_bucket) - objs_in_dir: List[str] = [ - obj_path for obj_path in s3_bucket_resource.objects.filter(Prefix=object_key) - ] - if len(objs_in_dir) == 0: - return [] - objs_in_dir: List[str] = [ - os.path.join(String.S3_PREFIX, obj_path.bucket_name, obj_path.key) for obj_path in objs_in_dir - ] - objs_in_dir: List[str] = [ - obj_path - for obj_path in objs_in_dir - if fnmatch.fnmatch(String.remove_prefix(obj_path, s3_path), file_glob) - ] - obj_names_map: Dict[str, str] = {obj_path: os.path.basename(obj_path) for obj_path in objs_in_dir} - obj_names_map = remove_values(obj_names_map, ignored_files) - objs_in_dir = list(obj_names_map.keys()) - return objs_in_dir - - @classmethod - def list_subdirs_in_dir(cls, *args, **kwargs) -> List[str]: - return cls.list_subdirs(*args, **kwargs) - - @classmethod - def list_subdirs( - cls, - s3_path: str, - *, - names_only: bool = False, - ) -> List[str]: - s3 = boto3.resource("s3") - s3_bucket, object_key = cls.s3_path_exploder(s3_path) - if not object_key.endswith(String.SLASH): - object_key += String.SLASH - s3_bucket_resource = s3.Bucket(s3_bucket) - paginator = s3_bucket_resource.meta.client.get_paginator("list_objects") - pagination_params: Dict = dict(Prefix=object_key, Delimiter=String.SLASH) - subdirs: List[str] = [] - ## Ref: https://stackoverflow.com/a/51372405 - for resp in paginator.paginate(Bucket=s3_bucket_resource.name, **pagination_params): - if "CommonPrefixes" in resp: - subdirs.extend( - [ - os.path.join(String.S3_PREFIX, s3_bucket, f["Prefix"]) - if not names_only - else String.remove_suffix( - String.remove_prefix(f["Prefix"], prefix=object_key), - suffix=String.SLASH, - ) - for f in resp["CommonPrefixes"] - ] - ) - return sorted(subdirs) - - @classmethod - def touch_s3_object(cls, s3_path: str): - s3_bucket, object_key = cls.s3_path_exploder(s3_path) - s3 = boto3.client("s3") - s3_resp = s3.put_object(Bucket=s3_bucket, Key=object_key) - return s3_resp - - @classmethod - def get_s3_object_str(cls, s3_path: str, retry: int = 1) -> str: - s3_bucket, object_key = cls.s3_path_exploder(s3_path) - out_str = cls.stream_s3_object(s3_path, retry=retry).read().decode("utf-8") - if String.is_empty(out_str): - raise IOError(f'Object in bucket "{s3_bucket}" with key "{object_key}" seems to be empty') - return out_str - - 
@classmethod - def get_s3_object_pickle(cls, s3_path: str, retry: int = 1) -> Any: - s3_bucket, object_key = cls.s3_path_exploder(s3_path) - pickled_data = cls.stream_s3_object(s3_path, retry=retry).read() - loaded_data = pickle.loads(pickled_data) - if loaded_data is None: - raise IOError(f'Object in bucket "{s3_bucket}" with key "{object_key}" seems to be empty') - return loaded_data - - @classmethod - def stream_s3_object(cls, s3_path: str, retry: int = 1) -> io.IOBase: - s3_bucket, object_key = cls.s3_path_exploder(s3_path) - s3 = boto3.resource("s3") - retry_wait_times = cls.__get_retry_wait_times_list(retry) - for retry_wait_time in retry_wait_times: - try: - time.sleep(retry_wait_time) - obj = s3.Object(s3_bucket, object_key) - stream = obj.get()["Body"] - return stream - except Exception as e: - Log.debug(String.format_exception_msg(e)) - if retry_wait_time != retry_wait_times[-1]: - Log.debug( - f'Retrying retrieval of object at bucket "{s3_bucket}" with key "{object_key}"...' - ) - raise IOError( - f'Cannot retrieve S3 object after {retry} attempts; bucket="{s3_bucket}", key="{object_key}"' - ) - - @classmethod - def put_s3_object_str( - cls, - s3_path: str, - obj_str: str, - overwrite: bool = True, - num_attempts: int = 1, - ): - s3_bucket, object_key = cls.s3_path_exploder(s3_path) - s3 = boto3.client("s3") - retry_wait_times = cls.__get_retry_wait_times_list(num_attempts) - for retry_wait_time in retry_wait_times: - try: - time.sleep(retry_wait_time) - if cls.s3_object_exists(s3_path) and overwrite is False: - raise FileExistsError( - f"File already exists at {s3_path}, set overwrite=True to overwrite it." - ) - s3.put_object(Body=obj_str, Bucket=s3_bucket, Key=object_key) - if cls.s3_object_exists(s3_path) is False: - raise IOError(f"Could not put object at bucket {s3_bucket} and key {object_key}") - return True - except Exception as e: - Log.error(String.format_exception_msg(e)) - if retry_wait_time != retry_wait_times[-1]: - Log.info(f"Retrying put of object at bucket {s3_bucket} and key {object_key}...") - raise IOError( - "Could not successfully put object at bucket %s and key %s, after %s attempts" - % (s3_bucket, object_key, num_attempts) - ) - - @classmethod - def put_s3_object_pickle( - cls, - s3_path: str, - obj_data: Any, - overwrite: bool = True, - num_attempts: int = 1, - ): - s3_bucket, object_key = cls.s3_path_exploder(s3_path) - s3 = boto3.client("s3") - retry_wait_times = cls.__get_retry_wait_times_list(num_attempts) - for retry_wait_time in retry_wait_times: - try: - time.sleep(retry_wait_time) - if cls.s3_object_exists(s3_path) and overwrite is False: - raise FileExistsError( - f"File already exists at {s3_path}, set overwrite=True to overwrite it." 
- ) - serialized_data = pickle.dumps(obj_data) - s3.put_object(Body=serialized_data, Bucket=s3_bucket, Key=object_key) - if cls.s3_object_exists(s3_path) is False: - raise IOError(f"Could not put object at bucket {s3_bucket} and key {object_key}") - return True - except Exception as e: - Log.error(String.format_exception_msg(e)) - if retry_wait_time != retry_wait_times[-1]: - Log.info(f"Retrying put of object at bucket {s3_bucket} and key {object_key}...") - raise IOError( - "Could not successfully put object at bucket %s and key %s, after %s attempts" - % (s3_bucket, object_key, num_attempts) - ) - - @classmethod - def copy_local_file_to_s3(cls, source_local_path: str, destination_s3_path: str, extra_args=None) -> bool: - if extra_args is None: - extra_args = {} - try: - FileSystemUtil.check_file_exists(source_local_path) - s3 = boto3.resource("s3") - ## Ref: https://boto3.amazonaws.com/v1/documentation/api/latest/reference/services/s3.html#S3.Client.upload_file - destination_s3_bucket, destination_object_key = cls.s3_path_exploder(destination_s3_path) - s3.meta.client.upload_file( - source_local_path, destination_s3_bucket, destination_object_key, extra_args - ) - assert cls.s3_object_exists(destination_s3_path), ( - f"Could not find file {destination_s3_path} after copying" - ) - return True - except Exception as e: - Log.error(String.format_exception_msg(e)) - return False - - @classmethod - def copy_local_dir_to_s3( - cls, - source_local_dir: str, - destination_s3_dir: str, - log: bool = False, - extra_args=None, - ) -> bool: - if not FileSystemUtil.dir_exists(source_local_dir): - raise OSError(f'Source directory does not exist: "{source_local_dir}"') - source_local_dir: str = FileSystemUtil.expand_dir(source_local_dir) - local_fpaths: List[str] = FileSystemUtil.list(source_local_dir, recursive=True, only_files=True) - if not cls.is_path_valid_s3_dir(destination_s3_dir): - raise OSError(f'Destination is not a valid S3 directory: "{destination_s3_dir}"') - for local_fpath in shuffle_items(local_fpaths): - local_fname: str = local_fpath.replace(source_local_dir, "") - s3_fpath: str = cls.construct_path_in_s3_dir(destination_s3_dir, name=local_fname, is_dir=False) - if log: - Log.info(f'Uploading file from "{local_fname}" to "{s3_fpath}"...') - try: - with Timer(silent=True) as timer: - cls.copy_local_file_to_s3( - source_local_path=local_fpath, - destination_s3_path=s3_fpath, - extra_args=extra_args, - ) - if log: - Log.info( - f'...uploaded file from "{local_fpath}" to "{s3_fpath}" in {timer.time_taken_str}.' 
- ) - except Exception as e: - Log.error(String.format_exception_msg(e)) - return False - return True - - @classmethod - def copy_s3_file_to_local(cls, source_s3_path: str, destination_local_path: str, extra_args=None) -> bool: - if extra_args is None: - extra_args = {} - try: - s3 = boto3.resource("s3") - assert cls.s3_object_exists(source_s3_path), f"Could not find file {source_s3_path} to copy" - ## Ref: https://boto3.amazonaws.com/v1/documentation/api/latest/reference/services/s3.html#S3.Client.download_file - source_s3_bucket, source_object_key = cls.s3_path_exploder(source_s3_path) - s3.meta.client.download_file( - source_s3_bucket, source_object_key, destination_local_path, extra_args - ) - FileSystemUtil.check_file_exists(destination_local_path) - return True - except Exception as e: - Log.error(String.format_exception_msg(e)) - return False - - @classmethod - def copy_s3_dir_to_local( - cls, - source_s3_dir: str, - destination_local_dir: str, - force_download: bool = False, - log: bool = False, - wait_timeout: int = 300, - extra_args=None, - ) -> bool: - s3_fpaths: List[str] = cls.list_recursive_objects_in_dir(source_s3_dir) - s3_fnames: Set[str] = {s3_fpath.split(String.SLASH)[-1] for s3_fpath in s3_fpaths} - destination_local_dir: str = FileSystemUtil.expand_dir(destination_local_dir) - local_fpaths: List[str] = FileSystemUtil.list(destination_local_dir) - if force_download: - local_fnames: Set[str] = set() - else: - local_fnames: Set[str] = {local_fpath.split(os.path.sep)[-1] for local_fpath in local_fpaths} - if local_fnames < s3_fnames: - s3_fpaths_to_download: List[str] = [ - s3_fpath - for s3_fpath in s3_fpaths - if s3_fpath.split(String.SLASH)[-1] in (s3_fnames - local_fnames) - ] - if len(s3_fpaths_to_download) == 0: - return True - time.sleep( - random.randint(0, 30000) / 1000 - ) ## Wait randomly between 0 and 30 seconds to acquire locks. - lock_file: str = os.path.join(destination_local_dir, f"{_LIBRARY_NAME}-download.lock") - if not FileSystemUtil.file_exists(lock_file): - ## Acquire lock: - try: - FileSystemUtil.touch_file(lock_file) - ## If we don't have a file, download a copy: - if log: - Log.info(f"Downloading {len(s3_fpaths_to_download)} files from S3...") - for s3_fpath_to_download in s3_fpaths_to_download: - fname: str = s3_fpath_to_download.split(String.SLASH)[-1] - local_fpath: str = os.path.join(destination_local_dir, fname) - if log: - Log.info(f'Downloading file from "{s3_fpath_to_download}" to "{local_fpath}"...') - with Timer(silent=True) as timer: - cls.copy_s3_file_to_local( - s3_fpath_to_download, local_fpath, extra_args=extra_args - ) - if log: - Log.info( - f'...downloaded file from "{s3_fpath_to_download}" to "{local_fpath}" ' - f"in {timer.time_taken_str}." - ) - finally: - FileSystemUtil.rm_file(lock_file) - else: - for i in range(wait_timeout // 10): - if not FileSystemUtil.file_exists(lock_file): - return True - time.sleep(10) - raise SystemError( - f"Waited for {wait_timeout} sec but still not completed downloading files to " - f'"{destination_local_dir}". 
Files which we had started downloading: {s3_fpaths_to_download}' - ) - - return True - - @classmethod - def copy_file_between_s3_locations( - cls, source_s3_path: str, destination_s3_path: str, extra_args=None - ) -> bool: - if extra_args is None: - extra_args = {} - try: - s3 = boto3.resource("s3") - assert cls.s3_object_exists(source_s3_path) - ## Ref: https://boto3.amazonaws.com/v1/documentation/api/latest/reference/services/s3.html#S3.Client.copy - destination_s3_bucket, destination_object_key = cls.s3_path_exploder(destination_s3_path) - s3.meta.client.copy( - cls.s3_path_exploder_dict(source_s3_path), - destination_s3_bucket, - destination_object_key, - extra_args, - ) - assert cls.s3_object_exists(destination_s3_path), ( - f"Could not find file {destination_s3_path} after copying" - ) - return True - except Exception as e: - Log.error(String.format_exception_msg(e)) - return False - - @classmethod - def copy_dir_between_s3_locations( - cls, source_s3_dir: str, destination_s3_dir: str, extra_args=None - ) -> bool: - if not cls.is_path_valid_s3_dir(source_s3_dir): - raise ValueError(f'Invalid s3 source directory: "{source_s3_dir}"') - if not cls.is_path_valid_s3_dir(destination_s3_dir): - raise ValueError(f'Invalid s3 destination directory: "{destination_s3_dir}"') - source_s3_fpaths: List[str] = cls.list(source_s3_dir) - try: - for source_s3_fpath in source_s3_fpaths: - if source_s3_fpath.endswith(String.SLASH): - continue ## Only copy files - source_s3_fname: str = source_s3_fpath.replace(source_s3_dir, "") - destination_s3_fpath: str = cls.construct_path_in_s3_dir( - destination_s3_dir, - name=source_s3_fname, - is_dir=False, - ) - cls.copy_file_between_s3_locations( - source_s3_fpath, destination_s3_fpath, extra_args=extra_args - ) - return True - except Exception as e: - Log.error(String.format_exception_msg(e)) - return False - - @staticmethod - def construct_path_in_s3_dir( - s3_path: str, - name: str, - is_dir: bool, - file_ending: Optional[str] = None, - ): - """ - If the path is a dir, uses the inputs to construct a file path. If path is a file, returns unchanged. - :param s3_path: path to dir (or file) in S3. - :param name: name of the file. - :param is_dir: whether the newly created path should be a directory or file. - :param file_ending: (optional) a string of the file ending. - :return: file path string. - """ - if S3Util.is_path_valid_s3_dir(s3_path): - file_name: str = String.assert_not_empty_and_strip(name) - if file_ending is not None: - file_name += String.assert_not_empty_and_strip(file_ending) - if s3_path.endswith(String.SLASH): - out_s3_path: str = String.EMPTY.join([s3_path, file_name]) - else: - out_s3_path: str = String.SLASH.join([s3_path, file_name]) - if is_dir and not out_s3_path.endswith(String.SLASH): - ## Add a slash at the end: - out_s3_path += String.SLASH - return out_s3_path - else: - return s3_path - - @classmethod - def __get_retry_wait_times_list(cls, num_attempts): - return [(math.pow(2, i - 1) - 1) for i in range(1, num_attempts + 1)] ## 0, 1, 3, 7, 15, ... 
- - @classmethod - def generate_presigned_s3_url(cls, s3_path: str, expiry: int = 7 * 24 * 60 * 60) -> str: - url = boto3.client("s3").generate_presigned_url( - ClientMethod="get_object", - Params=S3Util.s3_path_exploder_dict(s3_path), - ExpiresIn=expiry, - ## Max expiry time, see: X-Amz-Expires here: https://docs.aws.amazon.com/AmazonS3/latest/API/sigv4-query-string-auth.html - ) - return url diff --git a/src/fmcore/util/concurrency/__init__.py b/src/fmcore/util/concurrency/__init__.py deleted file mode 100644 index b4eb00b..0000000 --- a/src/fmcore/util/concurrency/__init__.py +++ /dev/null @@ -1,8 +0,0 @@ -"""A collection of concurrency utilities to augment the Python language:""" -from fmcore.util.concurrency._asyncio import * -from fmcore.util.concurrency._daemon import * -from fmcore.util.concurrency._dispatch import * -from fmcore.util.concurrency._processes import * -from fmcore.util.concurrency._ray import * -from fmcore.util.concurrency._threads import * -from fmcore.util.concurrency._utils import * diff --git a/src/fmcore/util/concurrency/_asyncio.py b/src/fmcore/util/concurrency/_asyncio.py deleted file mode 100644 index 5e43e4b..0000000 --- a/src/fmcore/util/concurrency/_asyncio.py +++ /dev/null @@ -1,69 +0,0 @@ -"""Jupyter-friendly asyncio usage:""" - -import asyncio -import atexit -import inspect -import threading -from functools import partial - - -def _asyncio_start_event_loop(loop): - asyncio.set_event_loop(loop) - loop.run_forever() - - -## Async wrapper to run a synchronous or asynchronous function in the event loop -async def __run_fn_async(fn, *args, run_sync_in_executor: bool = True, **kwargs): - if inspect.iscoroutinefunction(fn): - ## If fn is defined with `def async`, run this using asyncio mechanism, - ## meaning code inside fn is run in an sync way, except for the "await"-marked lines, which will - ## be run asynchronously. Note that "await"-marked lines must call other functions defined using "def async". - result = await fn(*args, **kwargs) - else: - ## The function is a regular synchronous function. - if run_sync_in_executor: - ## Run in the default executor (thread pool) for the event loop, otherwise it blocks the event loop - ## until the function execution completes. - ## The executor lives for the lifetime of the event loop. Ref: https://stackoverflow.com/a/33399896/4900327 - ## This basically is the same as run_concurrent, but with no control on the number of threads. - loop = asyncio.get_running_loop() - result = await loop.run_in_executor(None, partial(fn, *args, **kwargs)) - else: - ## Run the synchronous function directly in the event loop. - ## This will block the event loop until the function execution is complete, - ## preventing other tasks from running during this time. - result = fn(*args, **kwargs) - return result - - -## Function to submit the coroutine to the asyncio event loop -def run_asyncio(fn, *args, **kwargs): - ## Create a coroutine (i.e. Future), but do not actually start executing it. - coroutine = __run_fn_async(fn, *args, **kwargs) - ## Schedule the coroutine to execute on the event loop (which is running on thread _ASYNCIO_EVENT_LOOP_THREAD). 
- return asyncio.run_coroutine_threadsafe(coroutine, _ASYNCIO_EVENT_LOOP) - - -async def async_http_get(url): - import aiohttp - - async with aiohttp.ClientSession() as session: - async with session.get(url) as response: - return await response.read() - - -## Create a new loop and a thread running this loop -_ASYNCIO_EVENT_LOOP = asyncio.new_event_loop() -_ASYNCIO_EVENT_LOOP_THREAD = threading.Thread(target=_asyncio_start_event_loop, args=(_ASYNCIO_EVENT_LOOP,)) -_ASYNCIO_EVENT_LOOP_THREAD.start() - - -def _cleanup_event_loop(): - if _ASYNCIO_EVENT_LOOP.is_running(): - _ASYNCIO_EVENT_LOOP.call_soon_threadsafe(_ASYNCIO_EVENT_LOOP.stop) - _ASYNCIO_EVENT_LOOP_THREAD.join() - _ASYNCIO_EVENT_LOOP.close() - - -## Register the cleanup function to be called upon Python program exit -atexit.register(_cleanup_event_loop) diff --git a/src/fmcore/util/concurrency/_daemon.py b/src/fmcore/util/concurrency/_daemon.py deleted file mode 100644 index 83d686d..0000000 --- a/src/fmcore/util/concurrency/_daemon.py +++ /dev/null @@ -1,120 +0,0 @@ -import logging -import time -import traceback -from datetime import datetime -from typing import * - - -def daemon(wait: float, exit_on_error: bool = False, sentinel: Optional[List] = None, **kwargs): - """ - A decorator which runs a function as a daemon process in a background thread. - - You do not need to invoke this function directly: simply decorating the daemon function will start running it - in the background. - - Example using class method: your daemon should be marked with @staticmethod. Example: - class Printer: - DATA_LIST = [] - @staticmethod - @daemon(wait=3, mylist=DATA_LIST) - def printer_daemon(mylist): - if len(mylist) > 0: - print(f'Contents of list: {mylist}', flush=True) - - Example using sentinel: - run_sentinel = [True] - @daemon(wait=1, sentinel=run_sentinel) - def run(): - print('Running', flush=True) - time.sleep(3) ## Prints "Running" 3 times. - run_sentinel.pop() ## Stops "Running" from printing any more. - - :param wait: the wait time in seconds between invocations to the @daemon decorated function. - :param exit_on_error: whether to stop the daemon if an error is raised. - :sentinel: can be used to stop the executor. When not passed, the daemon runs forever. When passed, `sentinel` must - be a list with exactly one element (it can be anything). To stop the daemon, run "sentinel.pop()". It is - important to pass a list (not a tuple), since lists are mutable, and thus the same exact object is used by - both the executor and by the caller. - :param kwargs: list of arguments passed to the decorator, which are forwarded to the decorated function as kwargs. - These values will never change for the life of the daemon. However, if you pass references to mutables such as - lists, dicts, objects etc to the decorator and use them in the daemon function, you can run certain tasks at a - regular cadence on fresh data. - :return: None - """ - - ## Refs on how decorators work: - ## 1. https://www.datacamp.com/community/tutorials/decorators-python - def decorator(function): - ## Each decorated function gets its own executor. These are defined at the function-level, so - ## if you write two decorated functions `def say_hi` and `def say_bye`, they each gets a separate - ## executor. The executor for `say_hi` will call `say_hi` repeatedly, and the executor for `say_bye` will call - ## `say_bye` repeatedly; they will not interact. 
- executor = RestrictedConcurrencyThreadPoolExecutor(max_workers=1) - - def run_function_forever(sentinel): - while sentinel is None or len(sentinel) > 0: - start = time.perf_counter() - try: - function(**kwargs) - except Exception as e: - logging.debug(traceback.format_exc()) - if exit_on_error: - raise e - end = time.perf_counter() - time_to_wait: float = max(0.0, wait - (end - start)) - time.sleep(time_to_wait) - del executor ## Cleans up the daemon after it finishes running. - - if sentinel is not None: - if not isinstance(sentinel, list) or len(sentinel) != 1: - raise ValueError("When passing `sentinel`, it must be a list with exactly one item.") - executor.submit(run_function_forever, sentinel=sentinel) - - ## The wrapper here should do nothing, since you cannot call the daemon explicitly. - def wrapper(*args, **kwargs): - raise RuntimeError("Cannot call daemon function explicitly") - - return wrapper - - return decorator - - -## Dict of daemon ids to their sentinels -_DAEMONS: Dict[str, List[bool]] = {} - - -def start_daemon( - fn, - wait: float, - daemon_id: Optional[str] = None, - daemons: Dict[str, List[bool]] = _DAEMONS, - **kwargs, -) -> str: - assert isinstance(daemons, dict) - assert isinstance(wait, (int, float)) and wait >= 0.0 - if daemon_id is None: - dt: datetime = datetime.now() - dt: datetime = dt.replace(tzinfo=dt.astimezone().tzinfo) - if dt.tzinfo is not None: - daemon_id: str = dt.strftime("%Y-%m-%d %H:%M:%S.%f UTC%z").strip() - else: - daemon_id: str = dt.strftime("%Y-%m-%d %H:%M:%S.%f").strip() - assert isinstance(daemon_id, str) and len(daemon_id) > 0 - assert daemon_id not in daemons, f'Daemon with id "{daemon_id}" already exists.' - - daemon_sentinel: List[bool] = [True] - - @daemon(wait=wait, sentinel=daemon_sentinel) - def run(): - fn(**kwargs) - - daemons[daemon_id] = daemon_sentinel - return daemon_id - - -def stop_daemon(daemon_id: str, daemons: Dict[str, List[bool]] = _DAEMONS) -> bool: - assert isinstance(daemons, dict) - assert isinstance(daemon_id, str) and len(daemon_id) > 0 - daemon_sentinel: List[bool] = daemons.pop(daemon_id, [False]) - assert len(daemon_sentinel) == 1 - return daemon_sentinel.pop() diff --git a/src/fmcore/util/concurrency/_dispatch.py b/src/fmcore/util/concurrency/_dispatch.py deleted file mode 100644 index e063699..0000000 --- a/src/fmcore/util/concurrency/_dispatch.py +++ /dev/null @@ -1,400 +0,0 @@ -import time -from concurrent.futures import ProcessPoolExecutor, ThreadPoolExecutor -from concurrent.futures._base import Executor -from typing import * - -import numpy as np -import pandas as pd -from pydantic import Extra, root_validator - -from fmcore.constants.DataProcessingConstants import Parallelize -from fmcore.util.language import ( - Alias, - Parameters, - ProgressBar, - filter_kwargs, - get_default, - is_dict_like, - is_list_or_set_like, - set_param_from_alias, - type_str, -) - -from ._asyncio import run_asyncio -from ._processes import ActorPoolExecutor, ActorProxy, run_parallel -from ._ray import RayPoolExecutor, run_parallel_ray -from ._threads import ( - RestrictedConcurrencyThreadPoolExecutor, - kill_thread, - run_concurrent, - suppress_ThreadKilledSystemException, -) -from ._utils import ( - _LOCAL_ACCUMULATE_ITEM_WAIT, - _LOCAL_ACCUMULATE_ITER_WAIT, - _RAY_ACCUMULATE_ITEM_WAIT, - _RAY_ACCUMULATE_ITER_WAIT, - accumulate, - accumulate_iter, -) - - -def worker_ids( - executor: Optional[Union[ThreadPoolExecutor, ProcessPoolExecutor, ActorPoolExecutor]], -) -> Set[int]: - ## Returns a set of unique identifiers for all 
workers in the given executor - ## Input: executor - any supported pool executor (Thread, Process, or Actor) - ## Output: Set of thread IDs or process IDs depending on executor type - - if isinstance(executor, ThreadPoolExecutor): - ## For thread pools, return set of thread identifiers - return {th.ident for th in executor._threads} - elif isinstance(executor, ProcessPoolExecutor): - ## For process pools, return set of process IDs - return {p.pid for p in executor._processes.values()} - elif isinstance(executor, ActorPoolExecutor): - ## For actor pools, return set of actor process IDs - return {_actor._process.pid for _actor in executor._actors} - - ## Raise error if executor type is not supported - raise NotImplementedError(f"Cannot get worker ids for executor of type: {executor}") - - -class ExecutorConfig(Parameters): - """ - Configuration class for parallel execution settings used by dispatch functions. - Provides a structured way to define parallelization strategy and execution constraints. - - Attributes: - parallelize: Type of parallelization to use (sync, threads, processes, ray) - max_workers: Maximum number of parallel workers (None uses system defaults) - max_calls_per_second: Rate limiting for execution calls (infinity means no limit) - - Example usage: - >>> config = ExecutorConfig( - parallelize='threads', - max_workers=4, - max_calls_per_second=100.0 - ) - >>> executor = dispatch_executor(config=config) - - # Using with num_workers alias - >>> config = ExecutorConfig( - parallelize='processes', - num_workers=8 # alias for max_workers - ) - """ - - class Config(Parameters.Config): - extra = Extra.ignore ## Silently ignore any extra parameters for flexibility - - parallelize: Parallelize - max_workers: Optional[int] = None ## None lets the executor use system-appropriate defaults - max_calls_per_second: float = float("inf") ## No rate limiting by default - - @root_validator(pre=True) - def _set_params(cls, params: Dict) -> Dict: - """ - Pre-processes configuration parameters to support alternate parameter names. - Allows 'num_workers' as an alias for 'max_workers' for compatibility. - """ - set_param_from_alias(params, param="max_workers", alias=["num_workers"], default=None) - return params - - -def dispatch( - fn: Callable, - *args, - parallelize: Parallelize, - forward_parallelize: bool = False, - delay: float = 0.0, - executor: Optional[Executor] = None, - **kwargs, -) -> Any: - parallelize: Parallelize = Parallelize.from_str(parallelize) - if forward_parallelize: - kwargs["parallelize"] = parallelize - time.sleep(delay) - if parallelize is Parallelize.sync: - return fn(*args, **kwargs) - elif parallelize is Parallelize.asyncio: - return run_asyncio(fn, *args, **kwargs) - elif parallelize is Parallelize.threads: - return run_concurrent(fn, *args, executor=executor, **kwargs) - elif parallelize is Parallelize.processes: - return run_parallel(fn, *args, executor=executor, **kwargs) - elif parallelize is Parallelize.ray: - return run_parallel_ray(fn, *args, executor=executor, **kwargs) - raise NotImplementedError(f"Unsupported parallelization: {parallelize}") - - -def dispatch_executor( - *, config: Optional[Union[ExecutorConfig, Dict]] = None, **kwargs -) -> Optional[Executor]: - """ - Creates and configures an executor based on the provided configuration settings. - Returns None for synchronous execution or when using default system executors. 
- - The executor handles parallel task execution with configurable constraints like - maximum workers and rate limiting for thread-based execution. - - Args: - config: ExecutorConfig instance or dict containing parallelization settings - **kwargs: Additional configuration parameters that override config values - - Returns: - Configured executor instance or None if using defaults/sync execution - - Example usage: - >>> config = ExecutorConfig( - parallelize='threads', - max_workers=4, - max_calls_per_second=100.0 - ) - >>> executor = dispatch_executor(config=config) - - >>> executor = dispatch_executor( - config=dict(parallelize='processes', max_workers=8) - ) - """ - if config is None: - config: Dict = dict() - else: - assert isinstance(config, ExecutorConfig) - config: Dict = config.dict(exclude=True) - - ## Merge passed kwargs with config dict to allow parameter overrides - config: ExecutorConfig = ExecutorConfig(**{**config, **kwargs}) - - if config.max_workers is None: - ## Return None to use system defaults - this is more efficient for simple cases - return None - - if config.parallelize is Parallelize.sync: - return None - elif config.parallelize is Parallelize.threads: - ## Use restricted concurrency for threads to enable rate limiting - return RestrictedConcurrencyThreadPoolExecutor( - max_workers=config.max_workers, - max_calls_per_second=config.max_calls_per_second, - ) - elif config.parallelize is Parallelize.processes: - ## Actor-based pool enables better control over process lifecycle - return ActorPoolExecutor( - max_workers=config.max_workers, - ) - elif config.parallelize is Parallelize.ray: - ## Ray executor for distributed execution across multiple machines - return RayPoolExecutor( - max_workers=config.max_workers, - ) - else: - raise NotImplementedError( - f"Unsupported: you cannot create an executor with {config.parallelize} parallelization." - ) - - -def dispatch_apply( - struct: Union[List, Tuple, np.ndarray, pd.Series, Set, frozenset, Dict], - *args, - fn: Callable, - parallelize: Parallelize, - forward_parallelize: bool = False, - item_wait: Optional[float] = None, - iter_wait: Optional[float] = None, - iter: bool = False, - **kwargs, -) -> Any: - """ - Applies a function to each element in a data structure in parallel using the specified execution strategy. - Similar to map() but with parallel execution capabilities and progress tracking. - - The function handles different types of parallel execution: - - Synchronous (single-threaded) - - Asyncio-based concurrent execution (for low-latency async/await functions) - - Thread-based parallelism (for IO-bound tasks) - - Process-based parallelism (for CPU-bound tasks) - - Ray-based distributed execution (for multi-machine execution) - - Args: - struct: Input data structure to iterate over. 
Can be list-like or dict-like - *args: Additional positional args passed to each fn call - fn: Function to apply to each element - parallelize: Execution strategy (sync, threads, processes, ray, asyncio) - forward_parallelize: If True, passes the parallelize strategy to fn - item_wait: Delay between submitting individual items (rate limiting) - iter_wait: Delay between checking completion of submitted items - iter: If True, returns an iterator that yields results as they complete - **kwargs: Additional keyword args passed to each fn call - - Example usage: - >>> data = [1, 2, 3, 4, 5] - >>> def square(x): - return x * x - - >>> ## Process items in parallel using threads - >>> results = dispatch_apply( - data, - fn=square, - parallelize='threads', - max_workers=4 - ) - - >>> ## Process dictionary items using processes - >>> data = {'a': 1, 'b': 2, 'c': 3} - >>> results = dispatch_apply( - data, - fn=square, - parallelize='processes', - progress_bar=True - ) - """ - ## Convert string parallelization strategy to enum - parallelize: Parallelize = Parallelize.from_str(parallelize) - - ## Set appropriate wait times based on execution strategy: - ## - Sync/asyncio don't need waits since they're single-threaded - ## - Local execution (threads/processes) can use shorter waits - ## - Ray execution needs longer waits due to distributed nature - item_wait: float = get_default( - item_wait, - { - Parallelize.ray: _RAY_ACCUMULATE_ITEM_WAIT, - Parallelize.processes: _LOCAL_ACCUMULATE_ITEM_WAIT, - Parallelize.threads: _LOCAL_ACCUMULATE_ITEM_WAIT, - Parallelize.asyncio: 0.0, - Parallelize.sync: 0.0, - }[parallelize], - ) - iter_wait: float = get_default( - iter_wait, - { - Parallelize.ray: _RAY_ACCUMULATE_ITER_WAIT, - Parallelize.processes: _LOCAL_ACCUMULATE_ITER_WAIT, - Parallelize.threads: _LOCAL_ACCUMULATE_ITER_WAIT, - Parallelize.asyncio: 0.0, - Parallelize.sync: 0.0, - }[parallelize], - ) - - ## Forward parallelization strategy to child function if requested: - if forward_parallelize: - kwargs["parallelize"] = parallelize - - ## Create appropriate executor based on parallelization strategy: - executor: Optional = dispatch_executor( - parallelize=parallelize, - **kwargs, - ) - - try: - ## Configure progress bars for both submission and collection phases. 
- ## Default to showing progress unless explicitly disabled: - progress_bar: Optional[Dict] = Alias.get_progress_bar(kwargs) - submit_pbar: ProgressBar = ProgressBar.of( - progress_bar, - total=len(struct), - desc="Submitting", - prefer_kwargs=False, - unit="item", - ) - collect_pbar: ProgressBar = ProgressBar.of( - progress_bar, - total=len(struct), - desc="Collecting", - prefer_kwargs=False, - unit="item", - ) - - ## Handle list-like structures (lists, tuples, sets, arrays): - if is_list_or_set_like(struct): - futs = [] - for v in struct: - ## Wrap user function to handle item-level execution - def submit_task(item, **dispatch_kwargs): - return fn(item, **dispatch_kwargs) - - ## Submit task for parallel execution with rate limiting (item_wait): - futs.append( - dispatch( - fn=submit_task, - item=v, - parallelize=parallelize, - executor=executor, - delay=item_wait, - **filter_kwargs(fn, **kwargs), - ) - ) - submit_pbar.update(1) - - ## Handle dictionary-like structures: - elif is_dict_like(struct): - futs = {} - for k, v in struct.items(): - - def submit_task(item, **dispatch_kwargs): - return fn(item, **dispatch_kwargs) - - ## Submit task with key for maintaining dict structure: - futs[k] = dispatch( - fn=submit_task, - key=k, - item=v, - parallelize=parallelize, - executor=executor, - delay=item_wait, - **filter_kwargs(fn, **kwargs), - ) - submit_pbar.update(1) - else: - raise NotImplementedError(f"Unsupported type: {type_str(struct)}") - - submit_pbar.success() - - ## Return results either as iterator or all-at-once (afer accumulating all futures): - if iter: - return accumulate_iter( - futs, item_wait=item_wait, iter_wait=iter_wait, progress_bar=collect_pbar, **kwargs - ) - else: - return accumulate( - futs, item_wait=item_wait, iter_wait=iter_wait, progress_bar=collect_pbar, **kwargs - ) - finally: - ## Ensure executor is properly cleaned up even if processing fails: - stop_executor(executor) - - -def stop_executor( - executor: Optional[Executor], - force: bool = True, ## Forcefully terminate, might lead to work being lost. 
-): - if executor is not None: - if isinstance(executor, ThreadPoolExecutor): - suppress_ThreadKilledSystemException() - if force: - executor.shutdown(wait=False) ## Cancels pending items - for tid in worker_ids(executor): - kill_thread(tid) ## Note; after calling this, you can still submit - executor.shutdown(wait=False) ## Note; after calling this, you cannot submit - else: - executor.shutdown(wait=True) - del executor - elif isinstance(executor, ProcessPoolExecutor): - if force: - for process in executor._processes.values(): # Internal Process objects - process.terminate() # Forcefully terminate the process - - # Wait for the processes to clean up - for process in executor._processes.values(): - process.join() - executor.shutdown(wait=True, cancel_futures=True) - else: - executor.shutdown(wait=True, cancel_futures=True) - del executor - elif isinstance(executor, ActorPoolExecutor): - for actor in executor._actors: - assert isinstance(actor, ActorProxy) - actor.stop(cancel_futures=force) - del actor - del executor diff --git a/src/fmcore/util/concurrency/_processes.py b/src/fmcore/util/concurrency/_processes.py deleted file mode 100644 index 60bfa66..0000000 --- a/src/fmcore/util/concurrency/_processes.py +++ /dev/null @@ -1,405 +0,0 @@ -import multiprocessing as mp -import queue -import random -import threading -import traceback -import uuid -import warnings -from concurrent.futures import ProcessPoolExecutor -from concurrent.futures._base import Executor, Future -from concurrent.futures.process import BrokenProcessPool -from typing import * - -import cloudpickle - -from fmcore.constants.DataProcessingConstants import Status - -from ._utils import LoadBalancingStrategy - - -def actor_process_main(cls_bytes, init_args, init_kwargs, command_queue, result_queue): - cls = cloudpickle.loads(cls_bytes) - instance = None - while True: - command = command_queue.get() - if command is None: - break - request_id, method_name, args, kwargs = command - try: - if method_name == "__initialize__": - instance = cls(*init_args, **init_kwargs) - result_queue.put((request_id, "ok", None)) - continue - if instance is None: - raise RuntimeError("Actor instance not initialized.") - method = getattr(instance, method_name, None) - if method is None: - raise AttributeError(f"Method '{method_name}' not found.") - result = method(*args, **kwargs) - result_queue.put((request_id, "ok", result)) - except Exception as e: - tb_str = traceback.format_exc() - result_queue.put((request_id, "error", (e, tb_str))) - - -class ActorProxy: - def __init__(self, cls, init_args, init_kwargs, mp_context: Literal["fork", "spawn"]): - assert mp_context in {"fork", "spawn"} - ctx = mp.get_context(mp_context) - - self._uuid = str(uuid.uuid4()) - - self._command_queue = ctx.Queue() - self._result_queue = ctx.Queue() - self._num_submitted: int = 0 - self._task_status: Dict[Status, int] = { - Status.PENDING: 0, - Status.RUNNING: 0, - Status.SUCCEEDED: 0, - Status.FAILED: 0, - } - - self._futures = {} - self._futures_lock = threading.Lock() - - # Create the process using the fork context - cls_bytes = cloudpickle.dumps(cls) - self._cls_name = cls.__name__ - self._process: ctx.Process = ctx.Process( - target=actor_process_main, - args=( - cls_bytes, - init_args, - init_kwargs, - self._command_queue, - self._result_queue, - ), - ) - self._process.start() - - # Synchronous initialization - self._invoke_sync_initialize() - - self._stopped = False - - # Now start the asynchronous result handling using a thread: - self._result_thread = 
threading.Thread(target=self._handle_results, daemon=True) - self._result_thread.start() - - def _handle_results(self): - while True: - if not self._process.is_alive() and self._result_queue.empty(): - self._task_status[Status.RUNNING] = 0 - return - try: - item = self._result_queue.get(timeout=1) - except queue.Empty: - self._task_status[Status.RUNNING] = 0 - continue - if item is None: # Sentinel to stop the results-handling thread. - return - request_id, status, payload = item - with self._futures_lock: - future = self._futures.pop(request_id, None) - if future is not None: - if status == "ok": - future.set_result(payload) - self._task_status[Status.SUCCEEDED] += 1 - else: - e, tb_str = payload - future.set_exception(RuntimeError(f"Remote call failed:\n{tb_str}")) - self._task_status[Status.FAILED] += 1 - self._task_status[Status.PENDING] -= 1 - - def _invoke_sync_initialize(self): - request_id = self._uuid - self._command_queue.put((request_id, "__initialize__", (), {})) - # Direct, blocking call to get the response - rid, status, payload = self._result_queue.get() - if status == "error": - e, tb_str = payload - raise RuntimeError(f"Remote init failed:\n{tb_str}") - - def stop(self, timeout: int = 10, cancel_futures: bool = True): - if self._stopped is True: - return - self._stopped = True - self._command_queue.put(None) - self._process.join(timeout=timeout) - self._command_queue.close() - self._result_queue.close() - # Fail any remaining futures - if cancel_futures: - with self._futures_lock: - for fut in self._futures.values(): - if not fut.done(): - fut.set_exception(RuntimeError("Actor stopped before completion.")) - self._futures.clear() - self._task_status[Status.RUNNING] = 0 - - def _invoke(self, method_name, *args, **kwargs): - if self._stopped is True: - raise RuntimeError("Cannot invoke methods on a stopped actor.") - future = Future() - request_id = str(uuid.uuid4()) - with self._futures_lock: - self._futures[request_id] = future - self._command_queue.put((request_id, method_name, args, kwargs)) - self._num_submitted += 1 - self._task_status[Status.PENDING] += 1 - if self._process.is_alive(): - self._task_status[Status.RUNNING] = 1 - return future - - def submitted(self) -> int: - return self._num_submitted - - def pending(self) -> int: - return self._task_status[Status.PENDING] - - def running(self) -> int: - return self._task_status[Status.RUNNING] - - def succeeded(self) -> int: - return self._task_status[Status.SUCCEEDED] - - def failed(self) -> int: - return self._task_status[Status.FAILED] - - def __getattr__(self, name): - # Instead of returning a direct callable, we return a RemoteMethod wrapper - return RemoteMethod(self, name, self._cls_name) - - def __del__(self): - try: - if not self._stopped and self._process.is_alive(): - self.stop() - except Exception: - pass - - -class RemoteMethod: - """ - A wrapper object returned by ActorProxy.__getattr__. - To call the method remotely, use .remote(*args, **kwargs). - """ - - def __init__(self, proxy, method_name, cls_name): - self._proxy = proxy - self._method_name = method_name - self._cls_name = cls_name - - def remote(self, *args, **kwargs): - return self._proxy._invoke(self._method_name, *args, **kwargs) - - def options(self, *args, **kwargs): - warnings.warn( - f'The process-based Actor "{self._cls_name}" cannot use .options(); this call will be ignored.' - ) - return self - - -""" -Note: By default we use a `mp_context="fork"` for Actor creation. -Process creation is much slower under spawn than forking. 
For example: -- On a MacOS machine, Actor creation time is 20 milliseconds (forking) vs 7 seconds (spawn). -- On a Linux machine, Actor creation time is 20 milliseconds (forking) vs 17 seconds (spawn). - -However, forking comes with caveats which are not present in spawn: -1. Copy-on-Write Memory Behavior: -On Unix-like systems (including MacOS), forked processes share the same memory pages as the parent initially. -These pages are not immediately copied; instead, they are marked copy-on-write. -This means: -- No immediate bulk copy: Your large data structures (like Pandas DataFrames) do not get physically copied into memory -right away. -- Copies on modification: If either the parent or the child modifies a shared page, only then is that page actually -copied. Thus, if the child process reads from large data structures without writing to them, the overhead remains -relatively low. But if it modifies them, the memory cost could jump significantly. - -2. Potential Resource and Concurrency Issues: -Forking a process that already has multiple threads, open file descriptors, or other system resources can lead to -subtle bugs. Some libraries, particularly those relying on threading or certain system calls, may not be “fork-safe.” -Common issues include: -- Thread State: The child process starts with a copy of the parent’s memory but only one thread running (the one that -called fork). Any locks or conditions held by threads in the parent at the time of fork can lead to deadlocks or -inconsistent states. -- External Resources: Network sockets, open database connections, or other system resources may not be safe to use in -the child after fork without an exec. They might appear duplicated but can behave unexpectedly or lead to errors if -not reinitialized. -- Library Incompatibilities: Some libraries are not tested or guaranteed to work correctly in forked children. They -might rely on internal threading, which can break post-fork. -""" -_DEFAULT_ACTOR_PROCESS_CREATION_METHOD: Literal["fork", "spawn"] = "fork" - - -class Actor: - @classmethod - def remote( - cls, - *args, - mp_context: Literal["fork", "spawn"] = _DEFAULT_ACTOR_PROCESS_CREATION_METHOD, - **kwargs, - ): - return ActorProxy( - cls, - init_args=args, - init_kwargs=kwargs, - mp_context=mp_context, - ) - - @classmethod - def options(cls, *args, **kwargs): - warnings.warn( - f'The process-based Actor "{cls.__name__}" cannot use .options(); this call will be ignored.' - ) - return cls - - -def actor(cls, mp_context: Literal["fork", "spawn"] = _DEFAULT_ACTOR_PROCESS_CREATION_METHOD): - """ - Class decorator that transforms a regular class into an actor-enabled class. - The decorated class gains a .remote(*args, **kwargs) class method that - returns an ActorProxy running in a separate process. - """ - - def remote(*args, **kwargs): - return ActorProxy( - cls, - init_args=args, - init_kwargs=kwargs, - mp_context=mp_context, - ) - - def options(cls, *args, **kwargs): - warnings.warn( - f'The process-based Actor "{cls.__name__}" cannot use .options(); this call will be ignored.' - ) - return cls - - cls.remote = remote - cls.options = options - return cls - - -@actor -class TaskActor: - """ - A generic actor that can run an arbitrary callable passed to it. - We'll send (func, args, kwargs) as serialized objects and it will run them. 
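## A minimal usage sketch for the @actor decorator defined above, assuming a hypothetical `Counter` class.
## The default mp_context='fork' applies; to avoid the fork-related caveats described in the note, the class
## can instead subclass `Actor` and be created with `Counter.remote(mp_context='spawn')`.

@actor
class Counter:  ## hypothetical stateful class
    def __init__(self):
        self.total = 0

    def add(self, x: int) -> int:
        self.total += x
        return self.total

counter = Counter.remote()                    ## starts a Counter instance in a separate process
fut = counter.add.remote(5)                   ## remote method calls return concurrent.futures.Future objects
assert fut.result() == 5
assert counter.add.remote(3).result() == 8    ## state persists inside the actor process between calls
counter.stop()                                ## terminates the process; any pending futures are failed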
- """ - - def __init__(self): - pass - - def run_callable(self, func_bytes, args, kwargs): - func = cloudpickle.loads(func_bytes) - return func(*args, **kwargs) - - -class ActorPoolExecutor(Executor): - """ - A simple ActorPoolExecutor that mimics the ProcessPoolExecutor interface, - but uses a pool of TaskActor instances for parallel execution. - """ - - def __init__( - self, - max_workers: Optional[int] = None, - *, - load_balancing_strategy: LoadBalancingStrategy = LoadBalancingStrategy.ROUND_ROBIN, - ): - if max_workers is None: - max_workers = mp.cpu_count() - 1 - self._actors: List[ActorProxy] = [TaskActor.remote() for _ in range(max_workers)] - self._actor_index = 0 - self._max_workers = max_workers - self._load_balancing_strategy: LoadBalancingStrategy = LoadBalancingStrategy(load_balancing_strategy) - self._shutdown_lock = threading.Lock() - self._futures = [] - self._shutdown = False - - def submit(self, fn, *args, **kwargs): - with self._shutdown_lock: - if self._shutdown: - raise RuntimeError("Cannot submit tasks after shutdown") - - func_bytes = cloudpickle.dumps(fn) - if self._load_balancing_strategy is LoadBalancingStrategy.ROUND_ROBIN: - actor = self._actors[self._actor_index] - self._actor_index = (self._actor_index + 1) % self._max_workers - elif self._load_balancing_strategy is LoadBalancingStrategy.RANDOM: - actor = random.choice(self._actors) - elif self._load_balancing_strategy is LoadBalancingStrategy.LEAST_USED: - actor = sorted( - [(_actor, _actor.pending()) for _actor in self._actors], - key=lambda x: x[1], - )[0] - elif self._load_balancing_strategy is LoadBalancingStrategy.UNUSED: - actor = sorted( - [(_actor, _actor.running()) for _actor in self._actors], - key=lambda x: x[1], - )[0] - else: - raise NotImplementedError(f"Unsupported load_balancing_strategy: {self._load_balancing_strategy}") - future = actor.run_callable.remote(func_bytes, args, kwargs) - self._remove_completed_futures() - self._futures.append(future) - return future - - def _remove_completed_futures(self): - self._futures = [fut for fut in self._futures if not fut.done()] - - def shutdown(self, wait: bool = True, *, cancel_futures: bool = True) -> None: - with self._shutdown_lock: - if self._shutdown: - return - self._shutdown = True - - # If wait=True, wait for all futures to complete - if wait: - for fut in self._futures: - fut.result() # blocks until future is done or raises - self._remove_completed_futures() - # Stop all actors - for actor in self._actors: - actor.stop(cancel_futures=cancel_futures) - - def map(self, fn, *iterables, timeout=None, chunksize=1): - if chunksize != 1: - raise NotImplementedError("chunksize other than 1 is not implemented") - - inputs = zip(*iterables) - futures = [self.submit(fn, *args) for args in inputs] - - # Yield results in order - for fut in futures: - yield fut.result(timeout=timeout) - - -_GLOBAL_PROCESS_POOL_EXECUTOR = None -_GLOBAL_PROCESS_POOL_EXECUTOR_MAX_WORKERS: int = max(1, min(32, mp.cpu_count() - 1)) - - -def run_parallel( - fn, - *args, - executor: Optional[Union[ProcessPoolExecutor, ActorPoolExecutor]] = None, - **kwargs, -): - global _GLOBAL_PROCESS_POOL_EXECUTOR - if _GLOBAL_PROCESS_POOL_EXECUTOR is None: - _GLOBAL_PROCESS_POOL_EXECUTOR = ActorPoolExecutor( - max_workers=_GLOBAL_PROCESS_POOL_EXECUTOR_MAX_WORKERS - ) - if executor is None: - executor: ActorPoolExecutor = _GLOBAL_PROCESS_POOL_EXECUTOR - try: - # print(f'Running {fn_str(fn)} using {Parallelize.threads} with max_workers={executor._max_workers}') - return executor.submit(fn, 
*args, **kwargs) ## return a future - except BrokenProcessPool as e: - if executor is _GLOBAL_PROCESS_POOL_EXECUTOR: - executor = ActorPoolExecutor(max_workers=_GLOBAL_PROCESS_POOL_EXECUTOR_MAX_WORKERS) - del _GLOBAL_PROCESS_POOL_EXECUTOR - _GLOBAL_PROCESS_POOL_EXECUTOR = executor - return executor.submit(fn, *args, **kwargs) ## return a future - raise e diff --git a/src/fmcore/util/concurrency/_ray.py b/src/fmcore/util/concurrency/_ray.py deleted file mode 100644 index 20a27e6..0000000 --- a/src/fmcore/util/concurrency/_ray.py +++ /dev/null @@ -1,367 +0,0 @@ -import asyncio -import math -import threading -import time -from concurrent.futures import ThreadPoolExecutor -from concurrent.futures._base import Executor -from contextlib import contextmanager -from math import inf -from typing import * - -from pydantic import Extra, confloat, conint - -from fmcore.util.language import ( - Alias, - Parameters, - ProgressBar, - String, - UserEnteredParameters, - as_list, -) -from fmcore.util.language._import import ( - _IS_DASK_INSTALLED, - _IS_RAY_INSTALLED, - _check_is_ray_installed, -) - -from ._utils import ( - _RAY_ACCUMULATE_ITEM_WAIT, - _RAY_ACCUMULATE_ITER_WAIT, - get_result, - is_done, - wait, -) - -RayRuntimeEnv = dict -RequestCounter = "RequestCounter" -if _IS_RAY_INSTALLED: - import ray - from ray.runtime_env import RuntimeEnv as RayRuntimeEnv - - @ray.remote(num_cpus=1) - def _run_parallel_ray_executor(fn, *args, **kwargs): - return fn(*args, **kwargs) - - @ray.remote - class RequestCounter: - def __init__(self): - self.pending_requests: int = 0 - self.last_started: float = -1 - self.last_completed: float = -1 - - def started_request(self): - self.pending_requests += 1 - self.last_started: time.time() - - def completed_request(self): - self.pending_requests -= 1 - self.last_completed: time.time() - - def num_pending_requests(self) -> int: - return self.pending_requests - - def last_started_timestamp(self) -> float: - return self.last_started - - def last_completed_timestamp(self) -> float: - return self.last_completed - - -def _ray_asyncio_start_event_loop(loop): - asyncio.set_event_loop(loop) - loop.run_forever() - - -class RayPoolExecutor(Executor, Parameters): - """ - An executor that limits the number of concurrent Ray tasks by maintaining a pool of running tasks. - Unlike ThreadPoolExecutor which pre-allocates threads, this executor dynamically manages Ray tasks - using asyncio to control concurrency. - - Example usage: - >>> executor = RayPoolExecutor(max_workers=4) ## Must have ray installed - >>> future = executor.submit( - my_function, - arg1, - arg2, - num_cpus=2 ## Allocate 2 CPUs for this task - ) - >>> result = ray.get(future) ## Wait for and retrieve the result - - Attributes: - max_workers: Maximum number of concurrent Ray tasks. Pass float('inf') for unlimited tasks. - iter_wait: Time to wait between iterations when checking task completion. - item_wait: Time to wait between checking individual tasks. - """ - - max_workers: Union[int, Literal[inf]] - iter_wait: float = _RAY_ACCUMULATE_ITER_WAIT - item_wait: float = _RAY_ACCUMULATE_ITEM_WAIT - _asyncio_event_loop: Optional = None - _asyncio_event_loop_thread: Optional = None - _submission_executor: Optional[ThreadPoolExecutor] = None - _running_tasks: Dict[str, Any] = {} ## Maps task_uid to Ray ObjectRef - _latest_submit: Optional[int] = None - - def _set_asyncio(self): - """ - Lazily initializes the asyncio event loop and its thread. 
This is done on-demand to avoid - creating resources when the executor is not used with a worker limit. - """ - # Create a new loop and a thread running this loop - if self._asyncio_event_loop is None: - self._asyncio_event_loop = asyncio.new_event_loop() - # print(f'Started _asyncio_event_loop') - if self._asyncio_event_loop_thread is None: - self._asyncio_event_loop_thread = threading.Thread( - target=_ray_asyncio_start_event_loop, - args=(self._asyncio_event_loop,), - ) - self._asyncio_event_loop_thread.start() - # print(f'Started _asyncio_event_loop_thread') - - def submit( - self, - fn: Callable, - *args, - scheduling_strategy: str = "SPREAD", - num_cpus: int = 1, - num_gpus: int = 0, - max_retries: int = 0, - retry_exceptions: Union[List, bool] = True, - **kwargs, - ): - """ - Submits a function for execution using Ray. When max_workers is infinite, tasks are submitted - directly to Ray. Otherwise, uses asyncio to limit concurrent tasks. - - Args: - fn: Function to execute as a task - scheduling_strategy: Ray's scheduling strategy ("SPREAD" distributes tasks evenly across nodes). - num_cpus: Number of CPUs required per task - num_gpus: Number of GPUs required per task - max_retries: Number of times to retry failed tasks - retry_exceptions: Which exceptions should trigger retries - *args, **kwargs: Arguments passed to fn - - Returns: - If max_workers is inf: Ray ObjectRef - Otherwise: asyncio.Future that resolves to a Ray ObjectRef - """ - # print(f'Running {fn_str(fn)} using {Parallelize.ray} with num_cpus={num_cpus}, num_gpus={num_gpus}') - _check_is_ray_installed() - - def _submit_task(): - return _run_parallel_ray_executor.options( - scheduling_strategy=scheduling_strategy, - num_cpus=num_cpus, - num_gpus=num_gpus, - max_retries=max_retries, - retry_exceptions=retry_exceptions, - ).remote(fn, *args, **kwargs) - - _task_uid = str(time.time_ns()) - - if self.max_workers == inf: - return _submit_task() ## Submit to Ray directly - self._set_asyncio() - ## Create a coroutine (i.e. Future), but do not actually start executing it. - coroutine = self._ray_run_fn_async( - submit_task=_submit_task, - task_uid=_task_uid, - ) - - ## Schedule the coroutine to execute on the event loop (which is running on thread _asyncio_event_loop). - fut = asyncio.run_coroutine_threadsafe(coroutine, self._asyncio_event_loop) - return fut - - async def _ray_run_fn_async( - self, - submit_task: Callable, - task_uid: str, - ): - """ - Coroutine that manages task submission while respecting max_workers limit. - Waits for task slots to become available by checking completion of existing tasks. - - Example of how tasks are managed: - If max_workers=2 and 2 tasks are running: - 1. New task arrives, waits in while loop - 2. Loop checks existing tasks, finds completed task - 3. Completed task is removed, new task starts - 4. Process repeats for subsequent tasks - - Args: - submit_task: Callback that creates and submits the Ray task - task_uid: Unique identifier for tracking this task - """ - # self._running_tasks[task_uid] = None - ## Wait until we have capacity to run another task: - while len(self._running_tasks) >= self.max_workers: - ## Polling step: remove completed tasks until we have capacity: - for _task_uid in sorted(self._running_tasks.keys()): - if is_done(self._running_tasks[_task_uid]): - self._running_tasks.pop(_task_uid, None) ## Task has completed, forget about it - if len(self._running_tasks) < self.max_workers: - break ## Move onto next step to submit the task. 
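## Note: the `break` above only exits the inner scan over currently-running tasks; the capacity
## check that follows decides whether to leave the outer wait loop and proceed to submission.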
- time.sleep(self.item_wait) - if len(self._running_tasks) < self.max_workers: - break ## Break the outer loop to submit the task. - ## There is not enough capacity, keep waiting: - time.sleep(self.iter_wait) - - ## Now that we have capacity, submit the task and track it: - fut = submit_task() - self._running_tasks[task_uid] = fut - # print(f'Started {task_uid}. Num running: {len(self._running_tasks)}') - - # ## Cleanup any completed tasks: - # for k in list(self._running_tasks.keys()): - # if is_done(self._running_tasks[k]): - # self._running_tasks.pop(k, None) - # time.sleep(self.item_wait) - return fut - - -def run_parallel_ray( - fn, - *args, - scheduling_strategy: str = "SPREAD", - num_cpus: int = 1, - num_gpus: int = 0, - max_retries: int = 0, - retry_exceptions: Union[List, bool] = True, - executor: Optional[RayPoolExecutor] = None, - **kwargs, -): - _check_is_ray_installed() - # print(f'Running {fn_str(fn)} using {Parallelize.ray} with num_cpus={num_cpus}, num_gpus={num_gpus}') - if executor is not None: - assert isinstance(executor, RayPoolExecutor) - return executor.submit( - fn, - *args, - scheduling_strategy=scheduling_strategy, - num_cpus=num_cpus, - num_gpus=num_gpus, - max_retries=max_retries, - retry_exceptions=retry_exceptions, - **kwargs, - ) - else: - return _run_parallel_ray_executor.options( - scheduling_strategy=scheduling_strategy, - num_cpus=num_cpus, - num_gpus=num_gpus, - max_retries=max_retries, - retry_exceptions=retry_exceptions, - ).remote(fn, *args, **kwargs) - - -## Ref: https://docs.ray.io/en/latest/data/dask-on-ray.html#callbacks -@contextmanager -def RayDaskPersistWaitCallback(): ## Dummy contextmanager for cases when ray or dask is not installed. - yield - - -if _IS_RAY_INSTALLED and _IS_DASK_INSTALLED: - import ray - from ray.util.dask import RayDaskCallback - - class RayDaskPersistWaitCallback(RayDaskCallback): - ## Callback to wait for computation to complete when .persist() is called with block=True - def _ray_postsubmit_all(self, object_refs, dsk): - wait(object_refs) - - -def max_num_resource_actors( - model_num_resources: Union[conint(ge=0), confloat(ge=0.0, lt=1.0)], - ray_num_resources: int, -) -> Union[int, float]: - ## Returns number of models possible, restricted by a particular resource; takes into account - ## fractional resource requirements. - ## Note: all resource-requirements are either 0, a float between 0 and 1, or an integer above 1. - if model_num_resources == 0: - return math.inf - elif 0 < model_num_resources < 1: - ## E.g. when a model needs <1 GPU, multiple models can occupy the same GPU. - max_num_models_per_resource: int = math.floor(1 / model_num_resources) - return ray_num_resources * max_num_models_per_resource - else: - ## E.g. when a model needs >1 GPU, it must be the only model occupying that GPU. 
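## Worked examples (sketch) for the branches above, assuming `ray_num_resources` is a whole-resource count:
##   max_num_resource_actors(0, 8)    -> inf  (the model does not need this resource, so no limit)
##   max_num_resource_actors(0.25, 2) -> 8    (floor(1 / 0.25) = 4 models per GPU, times 2 GPUs)
##   max_num_resource_actors(2, 8)    -> 4    (floor(8 / 2) models, each holding 2 whole GPUs)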
- return math.floor(ray_num_resources / model_num_resources) - - -class RayInitConfig(UserEnteredParameters): - class Config(UserEnteredParameters.Config): - extra = Extra.allow - - ## Default values: - address: str = "auto" - temp_dir: Optional[str] = None - include_dashboard: bool = False - runtime_env: RayRuntimeEnv = {} - - -RayActorComposite = "RayActorComposite" - - -class RayActorComposite(Parameters): - actor_id: str - actor: Any - request_counter: Any - - def kill(self): - get_result(ray.kill(self.actor), wait=_RAY_ACCUMULATE_ITER_WAIT) - get_result(ray.kill(self.request_counter), wait=_RAY_ACCUMULATE_ITER_WAIT) - actor: ray.actor.ActorHandle = self.actor - request_counter: ray.actor.ActorHandle = self.request_counter - del actor - del request_counter - - @classmethod - def create_actors( - cls, - actor_factory: Callable, - *, - num_actors: int, - request_counter_num_cpus: float = 0.1, - request_counter_max_concurrency: int = 1000, - **kwargs, - ) -> List[RayActorComposite]: - progress_bar: Optional[Dict] = Alias.get_progress_bar(kwargs) - actors_progress_bar: ProgressBar = ProgressBar.of( - progress_bar, - total=num_actors, - desc="Creating Ray actors", - unit="actors", - ) - actor_ids: List[str] = as_list(String.random_name(num_actors)) - actor_composites: List[RayActorComposite] = [] - for actor_i, actor_id in zip(range(num_actors), actor_ids): - request_counter: ray.actor.ActorHandle = RequestCounter.options( - num_cpus=request_counter_num_cpus, - max_concurrency=request_counter_max_concurrency, - ).remote() - actor: ray.actor.ActorHandle = actor_factory( - request_counter=request_counter, - actor_i=actor_i, - actor_id=actor_id, - ) - actor_composites.append( - RayActorComposite( - actor_id=actor_id, - actor=actor, - request_counter=request_counter, - ) - ) - actors_progress_bar.update(1) - time.sleep(0.100) - if len(actor_composites) != num_actors: - msg: str = f"Creation of {num_actors - len(actor_composites)} actors failed" - actors_progress_bar.failed(msg) - raise ValueError(msg) - else: - msg: str = f"Created {num_actors} actors" - actors_progress_bar.success(msg) - return actor_composites diff --git a/src/fmcore/util/concurrency/_threads.py b/src/fmcore/util/concurrency/_threads.py deleted file mode 100644 index a6c2240..0000000 --- a/src/fmcore/util/concurrency/_threads.py +++ /dev/null @@ -1,298 +0,0 @@ -"""A collection of concurrency utilities to augment the Python language:""" - -import ctypes -import logging -import multiprocessing as mp -import time -from concurrent.futures import ThreadPoolExecutor -from concurrent.futures._base import Future -from concurrent.futures.thread import BrokenThreadPool -from math import inf -from threading import Lock, Semaphore -from typing import * - - -class ThreadKilledSystemException(BaseException): - """Custom exception for killing threads.""" - - pass - - -class ThreadKilledSystemExceptionFilter(logging.Filter): - def filter(self, record): - if record.exc_info: - exc_type = record.exc_info[0] - if exc_type.__name__ == "ThreadKilledSystemException": - return False - return True - - -def suppress_ThreadKilledSystemException(): - for _logger_module in ["concurrent.futures", "ipykernel", "ipykernel.ipykernel"]: - _logger = logging.getLogger(_logger_module) - _filter_exists: bool = False - for _filter in _logger.filters: - if _filter.__class__.__name__ == "ThreadKilledSystemExceptionFilter": - _filter_exists: bool = True - # print(f'{_filter.__class__.__name__} exists in {_logger_module} filters') - break - if not _filter_exists: - 
_logger.addFilter(ThreadKilledSystemExceptionFilter()) - # print(f'{ThreadKilledSystemExceptionFilter} added to {_logger_module} filters') - - -def kill_thread(tid: int): - """ - Forces termination of a thread by injecting a ThreadKilledSystemException into it. - This is a last-resort mechanism that should only be used when normal thread - termination methods have failed. - - Technical Implementation: - Uses the CPython C API (via ctypes) to inject an exception into the target thread's - execution context. When the exception is raised, it will terminate the thread's - execution at its next Python instruction. - - Example usage: - >>> def long_running_task(): - while True: - time.sleep(1) ## Simulate work - - >>> thread = threading.Thread(target=long_running_task) - >>> thread.start() - >>> thread_id = thread.ident - >>> kill_thread(thread_id) ## Thread will terminate on next instruction - - Intended usage: - 1. When performing concurrent/parallel tasks that may need to be cancelled after submission to a ThreadPoolExecutor: - Example: Cancelling a task in an interactive Jupyter session: - >>> prompt_template = "Who is the head of state in: {country}" - >>> countries = ['USA', 'UK', 'India', 'China', 'Russia', ... ] ## Assume a large list - >>> prompts = [prompt_template.format(country=country) for country in countries] - >>> def call_llm(prompt) -> str: - return call_gpt(prompt) - >>> ## Create a ThreadPoolExecutor: - executor = ThreadPoolExecutor(max_workers=10) - >>> ## Submit tasks to ThreadPoolExecutor: - for gpt_generated_text in accumulate_iter([ - run_concurrent(call_llm, prompt) - for prompt in prompt - ]): ## Waits for results as they complete and prints (may be out-of-order): - print(gpt_generated_text) - >>> ## Now, suppose while printing the results, we realise the prompt is not good. - >>> ## We want to cancel the pending tasks by pressing "stop" in Jupyter notebook. - >>> ## By default, this will raise a KeyboardInterrupt, but WILL NOT stop the running tasks! - >>> ## Instead, we can use kill_thread to stop the tasks: - >>> executor.shutdown(wait=False) ## Cancels pending items - >>> for tid in worker_ids(executor): - kill_thread(tid) ## After calling this, you can still submit - >>> executor.shutdown(wait=False) ## After calling this, you cannot submit - - Warning! Critical Thread-Safety expecations may be violated: - 1. Resource Cleanup: - - Locks, file handles, and network connections may remain locked/open - - Database transactions might be left uncommitted - Example: If thread holds a lock when killed: - >>> lock.acquire() - >>> kill_thread(tid) ## Lock remains acquired forever - 2. Data Integrity: - - Shared data structures may be left in inconsistent states - Example: During a multi-step update: - >>> data['step1'] = new_value - >>> kill_thread(tid) ## 'step2' never happens, data is corrupt - 3. System Stability: - - Python runtime isn't designed for forced thread termination - - May cause memory leaks or interpreter instability - Example: During critical system operations: - >>> sys.modules['critical_module'] = new_module - >>> kill_thread(tid) ## System left in unknown state - - Args: - tid: Thread ID (integer) of the thread to terminate. 
Obtain this from - threading.Thread.ident - - Raises: - ValueError: If tid is invalid - TypeError: If exctype is not derived from BaseException - SystemError: If thread termination fails - """ - exctype: Type[BaseException] = ThreadKilledSystemException - if not issubclass(exctype, BaseException): - raise TypeError("Only types derived from BaseException are allowed") - res = ctypes.pythonapi.PyThreadState_SetAsyncExc(ctypes.c_long(tid), ctypes.py_object(exctype)) - logging.debug(f"...killed thread ID: {tid}") - if res == 0: - raise ValueError(f"Invalid thread ID: {tid}") - elif res != 1: - # If it returns a number greater than one, you're in trouble, - # and you should call it again with exc=NULL to revert the effect - ctypes.pythonapi.PyThreadState_SetAsyncExc(ctypes.c_long(tid), None) - raise SystemError("PyThreadState_SetAsyncExc failed") - - -def concurrent(max_workers: int = 10, max_calls_per_second: float = inf): - """ - Decorator which runs function calls concurrently via multithreading. - When decorating an IO-bound function with @concurrent(MAX_THREADS), and then invoking the function - N times in a loop, it will run min(MAX_THREADS, N) invocations of the function concurrently. - For example, if your function calls another service, and you must invoke the function N times, decorating with - @concurrent(3) ensures that you only have 3 concurrent function-calls at a time, meaning you only make - 3 concurrent requests at a time. This reduces the number of connections you are making to the downstream service. - As this uses multi-threading and not multi-processing, it is suitable for IO-heavy functions, not CPU-heavy. - - Each call to the decorated function returns a future. Calling .result() on that future will return the value. - Generally, you should call the decorated function N times in a loop, and store the futures in a list/dict. Then, - call .result() on all the futures, saving the results in a new list/dict. Each .result() call is synchronous, so the - order of items is maintained between the lists. When doing this, at most min(MAX_THREADS, N) function calls will be - running concurrently. - Note that if the function calls throws an exception, then calling .result() will raise the exception in the - orchestrating code. If multiple function calls raise an exception, the one on which .result() was called first will - throw the exception to the orchestrating code. You should add try-catch logic inside your decorated function to - ensure exceptions are handled. - Note that decorated function `a` can call another decorated function `b` without issues; it is upto the function A - to determine whether to call .result() on the futures it gets from `b`, or return the future to its own invoker. - - `max_calls_per_second` controls the rate at which we can call the function. This is particularly important for - functions which execute quickly: e.g. suppose the decorated function calls a downstream service, and we allow a - maximum concurrency of 5. If each function call takes 100ms, then we end up making 1000/100*5 = 50 calls to the - downstream service each second. We thus should pass `max_calls_per_second` to restrict this to a smaller value. - - :param max_workers: the max number of threads which can be running the function at one time. This is thus - them max concurrency factor. - :param max_calls_per_second: controls the rate at which we can call the function. - :return: N/A, this is a decorator. - """ - - ## Refs: - ## 1. 
ThreadPoolExecutor: docs.python.org/3/library/concurrent.futures.html#concurrent.futures.Executor.submit - ## 2. Decorators: www.datacamp.com/community/tutorials/decorators-python - ## 3. Semaphores: www.geeksforgeeks.org/synchronization-by-using-semaphore-in-python/ - ## 4. Overall code: https://gist.github.com/gregburek/1441055#gistcomment-1294264 - def decorator(function): - ## Each decorated function gets its own executor and semaphore. These are defined at the function-level, so - ## if you write two decorated functions `def say_hi` and `def say_bye`, they each gets a separate executor and - ## semaphore. Then, if you invoke `say_hi` 30 times and `say_bye` 20 times, all 30 calls to say_hi will use the - ## same executor and semaphore, and all 20 `say_bye` will use a different executor and semaphore. The value of - ## `max_workers` will determine how many function calls actually run concurrently, e.g. if say_hi has - ## max_workers=5, then the 30 calls will run 5 at a time (this is enforced by the semaphore). - executor = ThreadPoolExecutor(max_workers=max_workers) - semaphore = Semaphore(max_workers) - - ## The minimum time between invocations. - min_time_interval_between_calls = 1 / max_calls_per_second - ## This only stores a single value, but it must be a list (mutable) for Python's function scoping to work. - time_last_called = [0.0] - - def wrapper(*args, **kwargs) -> Future: - semaphore.acquire() - time_elapsed_since_last_called = time.time() - time_last_called[0] - time_to_wait_before_next_call = max( - 0.0, min_time_interval_between_calls - time_elapsed_since_last_called - ) - time.sleep(time_to_wait_before_next_call) - - def run_function(*args, **kwargs): - try: - result = function(*args, **kwargs) - finally: - semaphore.release() ## If the function call throws an exception, release the semaphore. - return result - - time_last_called[0] = time.time() - return executor.submit(run_function, *args, **kwargs) ## return a future - - return wrapper - - return decorator - - -class RestrictedConcurrencyThreadPoolExecutor(ThreadPoolExecutor): - """ - This executor restricts concurrency (max active threads) and, optionally, rate (max calls per second). - It is similar in functionality to the @concurrent decorator, but implemented at the executor level. 
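## A brief usage sketch of both forms, assuming a hypothetical IO-bound function `ping`:

import time

def ping(i: int) -> int:
    time.sleep(0.1)  ## simulate an IO-bound request
    return i

## Decorator form: at most 3 concurrent calls, and at most 5 calls per second.
limited_ping = concurrent(max_workers=3, max_calls_per_second=5.0)(ping)
futures = [limited_ping(i) for i in range(10)]  ## each call returns a Future immediately
results = [fut.result() for fut in futures]     ## results come back in the same order as the calls

## Executor form: the same limits, enforced by the executor each time submit() is called.
executor = RestrictedConcurrencyThreadPoolExecutor(max_workers=3, max_calls_per_second=5.0)
futures = [executor.submit(ping, i) for i in range(10)]
results = [fut.result() for fut in futures]
executor.shutdown(wait=True)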
- """ - - def __init__( - self, - max_workers: Optional[int] = None, - *args, - max_calls_per_second: float = float("inf"), - **kwargs, - ): - if max_workers is None: - max_workers: int = min(32, (mp.cpu_count() or 1) + 4) - if not isinstance(max_workers, int) or (max_workers < 1): - raise ValueError("Expected `max_workers`to be a non-negative integer.") - kwargs["max_workers"] = max_workers - super().__init__(*args, **kwargs) - self._semaphore = Semaphore(max_workers) - self._max_calls_per_second = max_calls_per_second - - # If we have an infinite rate, don't enforce a delay - self._min_time_interval_between_calls = 1 / self._max_calls_per_second - - # Tracks the last time a call was started (not finished, just started) - self._time_last_called = 0.0 - self._lock = Lock() # Protects access to _time_last_called - - def submit(self, fn, *args, **kwargs): - # Enforce concurrency limit - self._semaphore.acquire() - - # Rate limiting logic: Before starting a new call, ensure we wait long enough if needed - if self._min_time_interval_between_calls > 0.0: - with self._lock: - time_elapsed_since_last_called = time.time() - self._time_last_called - time_to_wait = max( - 0.0, self._min_time_interval_between_calls - time_elapsed_since_last_called - ) - - # Wait the required time - if time_to_wait > 0: - time.sleep(time_to_wait) - - # Update the last-called time after the wait - with self._lock: - self._time_last_called = time.time() - else: - # No rate limiting, just update the last-called time - with self._lock: - self._time_last_called = time.time() - - future = super().submit(fn, *args, **kwargs) - # When the task completes, release the semaphore to allow another task to start - future.add_done_callback(lambda _: self._semaphore.release()) - return future - - -_GLOBAL_THREAD_POOL_EXECUTOR = None -_GLOBAL_THREAD_POOL_EXECUTOR_MAX_WORKERS: int = 16 - - -def run_concurrent( - fn, - *args, - executor: Optional[ThreadPoolExecutor] = None, - **kwargs, -): - global _GLOBAL_THREAD_POOL_EXECUTOR - if _GLOBAL_THREAD_POOL_EXECUTOR is None: - _GLOBAL_THREAD_POOL_EXECUTOR = RestrictedConcurrencyThreadPoolExecutor( - max_workers=_GLOBAL_THREAD_POOL_EXECUTOR_MAX_WORKERS - ) - if executor is None: - executor: ThreadPoolExecutor = _GLOBAL_THREAD_POOL_EXECUTOR - try: - # logging.debug(f'Running {fn_str(fn)} using {Parallelize.threads} with max_workers={executor._max_workers}') - return executor.submit(fn, *args, **kwargs) ## return a future - except BrokenThreadPool as e: - if executor is _GLOBAL_THREAD_POOL_EXECUTOR: - executor = RestrictedConcurrencyThreadPoolExecutor( - max_workers=_GLOBAL_THREAD_POOL_EXECUTOR_MAX_WORKERS - ) - del _GLOBAL_THREAD_POOL_EXECUTOR - _GLOBAL_THREAD_POOL_EXECUTOR = executor - return executor.submit(fn, *args, **kwargs) ## return a future - raise e - - -suppress_ThreadKilledSystemException() diff --git a/src/fmcore/util/concurrency/_utils.py b/src/fmcore/util/concurrency/_utils.py deleted file mode 100644 index f97dd83..0000000 --- a/src/fmcore/util/concurrency/_utils.py +++ /dev/null @@ -1,466 +0,0 @@ -import time -from concurrent.futures import wait as wait_future -from concurrent.futures._base import Future -from typing import * - -import numpy as np - -from fmcore.constants.DataProcessingConstants import Status -from fmcore.util.language import Alias, AutoEnum, ProgressBar, String, auto, first_item, get_default, type_str -from fmcore.util.language._import import _IS_RAY_INSTALLED - -if _IS_RAY_INSTALLED: - import ray - -_LOCAL_ACCUMULATE_ITEM_WAIT: float = 1e-3 ## 1ms 
-_RAY_ACCUMULATE_ITEM_WAIT: float = 10e-3 ## 10ms - -_LOCAL_ACCUMULATE_ITER_WAIT: float = 100e-3 ## 100ms -_RAY_ACCUMULATE_ITER_WAIT: float = 1000e-3 ## 1000ms - - -class LoadBalancingStrategy(AutoEnum): - ROUND_ROBIN = auto() - LEAST_USED = auto() - UNUSED = auto() - RANDOM = auto() - - -def get_result( - x, - *, - wait: float = 1.0, ## 1000 ms -) -> Optional[Any]: - if isinstance(x, Future): - return get_result(x.result(), wait=wait) - if _IS_RAY_INSTALLED and isinstance(x, ray.ObjectRef): - from ray.exceptions import GetTimeoutError - - while True: - try: - return ray.get(x, timeout=wait) - except GetTimeoutError: - pass - return x - - -def is_future(x) -> bool: - if isinstance(x, Future): - return True - elif _IS_RAY_INSTALLED and isinstance(x, ray.ObjectRef): - return True - return False - - -def is_running(x) -> bool: - if isinstance(x, Future): - return x.running() ## It might be scheduled but not running. - if _IS_RAY_INSTALLED and isinstance(x, ray.ObjectRef): - return not is_done(x) - return False - - -def is_done(x) -> bool: - if isinstance(x, Future): - return x.done() - if _IS_RAY_INSTALLED and isinstance(x, ray.ObjectRef): - ## Ref: docs.ray.io/en/latest/ray-core/tasks.html#waiting-for-partial-results - done, not_done = ray.wait([x], timeout=0) ## Immediately check if done. - return len(done) > 0 and len(not_done) == 0 - return True - - -def is_successful(x, *, pending_returns_false: bool = False) -> Optional[bool]: - if not is_done(x): - if pending_returns_false: - return False - else: - return None - try: - get_result(x) - return True - except Exception: - return False - - -def is_failed(x, *, pending_returns_false: bool = False) -> Optional[bool]: - if not is_done(x): - if pending_returns_false: - return False - else: - return None - try: - get_result(x) - return False - except Exception: - return True - - -def get_status(x) -> Status: - if is_running(x): - return Status.RUNNING - if not is_done(x): ## Not running and not done, thus pending i.e. scheduled - return Status.PENDING - ## The future is done: - if is_successful(x): - return Status.SUCCEEDED - if is_failed(x): - return Status.FAILED - - -def wait_if_future(x): - if isinstance(x, Future): - wait_future([x]) - elif _IS_RAY_INSTALLED and isinstance(x, ray.ObjectRef): - ray.wait([x]) - - -def retry( - fn, - *args, - retries: int = 5, - wait: float = 10.0, - jitter: float = 0.5, - silent: bool = True, - return_num_failures: bool = False, - **kwargs, -) -> Union[Any, Tuple[Any, int]]: - """ - Retries a function call a certain number of times, waiting between calls (with a jitter in the wait period). - :param fn: the function to call. - :param retries: max number of times to try. If set to 0, will not retry. - :param wait: average wait period between retries - :param jitter: limit of jitter (+-). E.g. jitter=0.1 means we will wait for a random time period in the range - (0.9 * wait, 1.1 * wait) seconds. - :param silent: whether to print an error message on each retry. - :param kwargs: keyword arguments forwarded to the function. - :param return_num_failures: whether to return the number of times failed. - :return: the function's return value if any call succeeds. If return_num_failures is set, returns this as the second result. - :raise: RuntimeError if all `retries` calls fail. 
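## A small usage sketch, assuming a hypothetical flaky callable `fetch_quote` that sometimes raises:

import random

def fetch_quote() -> float:  ## hypothetical flaky callable
    if random.random() < 0.5:
        raise ConnectionError("transient failure")
    return 42.0

## Attempts the call up to `retries + 1` times, waiting ~1s (+/- 50% jitter) between attempts.
price = retry(fetch_quote, retries=3, wait=1.0, jitter=0.5, silent=False)

## Optionally also report how many attempts failed before one succeeded.
price, num_failures = retry(fetch_quote, retries=3, wait=1.0, return_num_failures=True)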
- """ - assert isinstance(retries, int) and 0 <= retries - assert isinstance(wait, (int, float)) and 0 <= wait - assert isinstance(jitter, (int, float)) and 0 <= jitter <= 1 - wait: float = float(wait) - latest_exception = None - num_failures: int = 0 - for retry_num in range(retries + 1): - try: - out = fn(*args, **kwargs) - if return_num_failures: - return out, num_failures - else: - return out - except Exception as e: - num_failures += 1 - latest_exception = String.format_exception_msg(e) - if not silent: - print( - f"Function call failed with the following exception (attempts: {retry_num + 1}):\n{latest_exception}" - ) - if retry_num < (retries - 1): - print(f"Retrying {retries - (retry_num + 1)} more time(s)...\n") - time.sleep(np.random.uniform(wait - wait * jitter, wait + wait * jitter)) - raise RuntimeError( - f"Function call failed {retries + 1} time(s).\nLatest exception:\n{latest_exception}\n" - ) - - -def wait( - futures: Union[Tuple, List, Set, Dict, Any], - *, - check_done: bool = True, - item_wait: float = 0.1, ## 100 ms - iter_wait: float = 1.0, ## 1000 ms - **kwargs, -) -> NoReturn: - """Join operation on a single future or a collection of futures.""" - progress_bar: Optional[Dict] = Alias.get_progress_bar(kwargs) - - if isinstance(futures, (list, tuple, set, np.ndarray)): - futures: List[Any] = list(futures) - completed_futures: List[bool] = [is_done(fut) if check_done else False for fut in futures] - pbar: ProgressBar = ProgressBar.of( - progress_bar, - total=len(futures), - initial=sum(completed_futures), - desc="Waiting", - prefer_kwargs=False, - unit="item", - ) - while not all(completed_futures): - for i, fut in enumerate(futures): - if completed_futures[i] is False: - completed_futures[i] = is_done(fut) - if completed_futures[i] is True: - pbar.update(1) - time.sleep(item_wait) - time.sleep(iter_wait) - pbar.success("Done", close=False) - elif isinstance(futures, dict): - futures: List[Tuple[Any, Any]] = list(futures.items()) - completed_futures: List[bool] = [ - (is_done(fut_k) and is_done(fut_v)) if check_done else False for fut_k, fut_v in futures - ] - pbar: ProgressBar = ProgressBar.of( - progress_bar, - total=len(futures), - initial=sum(completed_futures), - desc="Waiting", - prefer_kwargs=False, - unit="item", - ) - while not all(completed_futures): - for i, (fut_k, fut_v) in enumerate(futures): - if completed_futures[i] is False: - completed_futures[i] = is_done(fut_k) and is_done(fut_v) - if completed_futures[i] is True: - pbar.update(1) - time.sleep(item_wait) - time.sleep(iter_wait) - pbar.success("Done", close=False) - else: - wait_if_future(futures) - - -def accumulate( - futures: Union[Tuple, List, Set, Dict, Any], - *, - check_done: bool = True, - item_wait: Optional[float] = None, - iter_wait: Optional[float] = None, - succeeded_only: bool = False, - **kwargs, -) -> Union[List, Tuple, Set, Dict, Any]: - """ - Description: - Recursively collects results from nested futures, supporting both concurrent.futures.Future and ray.ObjectRef. - Unlike the standard .result() calls which block until completion, this function provides progress tracking - and supports nested structures of futures. - - Args: - futures: Single future or collection of futures to accumulate - check_done: Whether to verify completion before collecting. 
Set False to force immediate collection - item_wait: Time to wait between checking individual futures (auto-selected based on future type) - iter_wait: Time to wait between iterations over all futures (auto-selected based on future type) - succeeded_only: If True, only return results from successfully completed futures - **kwargs: Additional arguments like configuration for "progress_bar" - - Returns: - Collection of results matching the structure of input futures - - Technical Implementation: - 1. For lists/tuples/sets: Recursively accumulates each future while maintaining original container type - 2. For dicts: Accumulates both keys and values, supporting futures in either position - 3. Uses different wait times for Ray vs concurrent.futures to account for their performance characteristics - - Example usage (with a list of futures from ThreadPoolExecutor; similar for ProcessPoolExecutor): - >>> executor = ThreadPoolExecutor(max_workers=4) - >>> futures = [ - executor.submit(time.sleep, i) - for i in range(5) - ] ## Create 5 futures that sleep for 0,1,2,3,4 seconds - >>> results = accumulate( - futures, - progress_bar=dict(desc="Processing") - ) ## Shows progress bar while collecting results - >>> print(results) ## [None, None, None, None, None] - - Example usage (with Ray): - >>> @ray.remote - def slow_add(a, b): - time.sleep(random.random()) ## Simulate varying compute times - return a + b - >>> futures = [ - slow_add.remote(i, i) - for i in range(10) - ] ## Submit 10 parallel additions - >>> results = accumulate( - futures, - progress_bar=dict(desc="Adding numbers") - ) ## Shows progress while collecting - >>> print(results) ## [0, 2, 4, 6, 8, 10, 12, 14, 16, 18] - - Example (usage with futures in dict): - >>> futures_dict = { - k: executor.submit(float, k) ## Converts int to float - for k in range(3) - } ## Values are futures, but both keys and values could be futures - >>> results = accumulate(futures_dict) - >>> print(results) ## {'0': 0.0, '1': 1.0, '2': 2.0} - """ - progress_bar: Optional[Dict] = Alias.get_progress_bar(kwargs) - if isinstance(futures, (list, set, tuple)) and len(futures) > 0: - if isinstance(first_item(futures), Future): - item_wait: float = get_default(item_wait, _LOCAL_ACCUMULATE_ITEM_WAIT) - iter_wait: float = get_default(iter_wait, _LOCAL_ACCUMULATE_ITER_WAIT) - else: - item_wait: float = get_default(item_wait, _RAY_ACCUMULATE_ITEM_WAIT) - iter_wait: float = get_default(iter_wait, _RAY_ACCUMULATE_ITER_WAIT) - if succeeded_only: - return type(futures)( - [ - accumulate(fut, progress_bar=False, check_done=check_done, succeeded_only=succeeded_only) - for fut in futures - if is_successful(fut) - ] - ) - completed_futures: List[bool] = [is_done(fut) if check_done else False for fut in futures] - accumulated_futures: List = [ - accumulate(fut, progress_bar=False, check_done=check_done) if future_is_complete else fut - for future_is_complete, fut in zip(completed_futures, futures) - ] - pbar: ProgressBar = ProgressBar.of( - progress_bar, - total=len(futures), - initial=sum(completed_futures), - desc="Collecting", - prefer_kwargs=False, - unit="item", - ) - while not all(completed_futures): - for i, fut in enumerate(accumulated_futures): - if completed_futures[i] is False: - completed_futures[i] = is_done(fut) - if completed_futures[i] is True: - accumulated_futures[i] = accumulate(fut, progress_bar=False, check_done=check_done) - pbar.update(1) - time.sleep(item_wait) - time.sleep(iter_wait) - pbar.success("Done", close=False) - return 
type(futures)(accumulated_futures) ## Convert - elif isinstance(futures, dict) and len(futures) > 0: - if isinstance(first_item(futures)[0], Future) or isinstance(first_item(futures)[1], Future): - item_wait: float = get_default(item_wait, _LOCAL_ACCUMULATE_ITEM_WAIT) - iter_wait: float = get_default(iter_wait, _LOCAL_ACCUMULATE_ITER_WAIT) - else: - item_wait: float = get_default(item_wait, _RAY_ACCUMULATE_ITEM_WAIT) - iter_wait: float = get_default(iter_wait, _RAY_ACCUMULATE_ITER_WAIT) - futures: List[Tuple] = list(futures.items()) - if succeeded_only: - return dict( - [ - ( - accumulate( - fut_k, progress_bar=False, check_done=check_done, succeeded_only=succeeded_only - ), - accumulate( - fut_v, progress_bar=False, check_done=check_done, succeeded_only=succeeded_only - ), - ) - for fut_k, fut_v in futures - if (is_successful(fut_k) and is_successful(fut_v)) - ] - ) - completed_futures: List[bool] = [ - (is_done(fut_k) and is_done(fut_v)) if check_done else False for fut_k, fut_v in futures - ] - accumulated_futures: List[Tuple] = [ - ( - accumulate(fut_k, progress_bar=False, check_done=check_done), - accumulate(fut_v, progress_bar=False, check_done=check_done), - ) - if future_is_complete - else (fut_k, fut_v) - for future_is_complete, (fut_k, fut_v) in zip(completed_futures, futures) - ] - pbar: ProgressBar = ProgressBar.of( - progress_bar, - total=len(futures), - initial=sum(completed_futures), - desc="Collecting", - prefer_kwargs=False, - unit="item", - ) - while not all(completed_futures): - for i, (fut_k, fut_v) in enumerate(accumulated_futures): - if completed_futures[i] is False: - completed_futures[i] = is_done(fut_k) and is_done(fut_v) - if completed_futures[i] is True: - accumulated_futures[i] = ( - accumulate(fut_k, progress_bar=False, check_done=check_done), - accumulate(fut_v, progress_bar=False, check_done=check_done), - ) - pbar.update(1) - time.sleep(item_wait) - time.sleep(iter_wait) - pbar.success("Done", close=False) - return dict(accumulated_futures) - else: - return get_result(futures) - - -def accumulate_iter( - futures: Union[Tuple, List, Set, Dict], - *, - item_wait: Optional[float] = None, - iter_wait: Optional[float] = None, - allow_partial_results: bool = False, - **kwargs, -): - """ - Here we iteratively accumulate and yield completed futures as they have completed. - This might return them out-of-order. 
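Two patterns supported by the accumulate/accumulate_iter pair above, sketched under the assumption that Ray is installed and initialized; might_fail is illustrative only.

import ray

ray.init(ignore_reinit_error=True)

@ray.remote
def might_fail(i: int) -> int:
    if i % 4 == 0:
        raise ValueError(f"bad input: {i}")
    return i * 10

futs = [might_fail.remote(i) for i in range(8)]
wait(futs)  ## Ensure everything has finished before filtering on success

## Keep only results from futures that completed without raising:
ok_results = accumulate(futs, succeeded_only=True)

## Or stream results one-by-one as they complete (possibly out of order);
## with allow_partial_results=True, failed futures are yielded back as-is instead of raising:
for res in accumulate_iter(futs, allow_partial_results=True):
    print(res)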
- """ - progress_bar: Optional[Dict] = Alias.get_progress_bar(kwargs) - pbar: ProgressBar = ProgressBar.of( - progress_bar, - total=len(futures), - desc="Iterating", - prefer_kwargs=False, - unit="item", - ) - if isinstance(futures, (list, set, tuple)) and len(futures) > 0: - if isinstance(first_item(futures), Future): - item_wait: float = get_default(item_wait, _LOCAL_ACCUMULATE_ITEM_WAIT) - iter_wait: float = get_default(iter_wait, _LOCAL_ACCUMULATE_ITER_WAIT) - else: - item_wait: float = get_default(item_wait, _RAY_ACCUMULATE_ITEM_WAIT) - iter_wait: float = get_default(iter_wait, _RAY_ACCUMULATE_ITER_WAIT) - ## Copy as list: - futures: List = [fut for fut in futures] - yielded_futures: List[bool] = [False for fut in futures] - while not all(yielded_futures): - for i, fut in enumerate(futures): - if yielded_futures[i] is False and is_done(fut): - try: - yielded_futures[i] = True - pbar.update(1) - yield get_result(fut) - time.sleep(item_wait) - except Exception as e: - if not allow_partial_results: - pbar.failed(close=False) - raise e - yield fut - time.sleep(iter_wait) - pbar.success("Done", close=False) - elif isinstance(futures, dict) and len(futures) > 0: - ## Copy as list: - futures: List[Tuple[Any, Any]] = [(fut_k, fut_v) for fut_k, fut_v in futures.items()] - if isinstance(first_item(futures)[0], Future) or isinstance(first_item(futures)[1], Future): - item_wait: float = get_default(item_wait, _LOCAL_ACCUMULATE_ITEM_WAIT) - iter_wait: float = get_default(iter_wait, _LOCAL_ACCUMULATE_ITER_WAIT) - else: - item_wait: float = get_default(item_wait, _RAY_ACCUMULATE_ITEM_WAIT) - iter_wait: float = get_default(iter_wait, _RAY_ACCUMULATE_ITER_WAIT) - yielded_futures: List[bool] = [False for fut_k, fut_v in futures] - while not all(yielded_futures): - for i, (fut_k, fut_v) in enumerate(futures): - if yielded_futures[i] is False and (is_done(fut_k) and is_done(fut_v)): - try: - yielded_futures[i] = True - pbar.update(1) - yield (get_result(fut_k), get_result(fut_v)) - pbar.update(1) - time.sleep(item_wait) - except Exception as e: - if not allow_partial_results: - pbar.failed(close=False) - raise e - yield (fut_k, fut_v) - time.sleep(iter_wait) - pbar.success("Done", close=False) - else: - if not isinstance(futures, (list, set, tuple, dict)): - raise NotImplementedError(f"Cannot iteratively collect from object of type: {type_str(futures)}.") diff --git a/src/fmcore/util/environment.py b/src/fmcore/util/environment.py deleted file mode 100644 index 9526397..0000000 --- a/src/fmcore/util/environment.py +++ /dev/null @@ -1,50 +0,0 @@ -import os -from typing import * - -from fmcore.util.language import String, Utility, get_default - - -class EnvUtil(Utility): - ## KEY - PROCESSING_JOB_NAME: ClassVar[str] = "PROCESSING_JOB_NAME" - SNS_TOPIC: ClassVar[str] = "SNS_TOPIC_ARN" - SNS_TOPIC_REGION: ClassVar[str] = "SNS_TOPIC_REGION" - DDB_TABLE_REGION: ClassVar[str] = "DDB_TABLE_REGION" - DDB_TABLE_NAME: ClassVar[str] = "DDB_TABLE_NAME" - LOG_LEVEL: ClassVar[str] = "LOG_LEVEL" - CHIME_WEBHOOK_URL: ClassVar[str] = "CHIME_WEBHOOK_URL" - CUDA_VISIBLE_DEVICES: ClassVar[str] = "CUDA_VISIBLE_DEVICES" - - @classmethod - def var_exists(cls, env_var_key) -> bool: - env_var_key: str = String.assert_not_empty_and_strip(env_var_key) - if os.environ.get(env_var_key) is not None: - return True - else: - return False - - @classmethod - def get_var(cls, env_var_key: str, check_cases: bool = True) -> Optional[str]: - env_var_key: str = String.assert_not_empty_and_strip(env_var_key) - if cls.var_exists(env_var_key): - 
return os.environ.get(env_var_key) - if check_cases and cls.var_exists(env_var_key.upper()): - return os.environ.get(env_var_key.upper()) - if check_cases and cls.var_exists(env_var_key.lower()): - return os.environ.get(env_var_key.lower()) - return None - - @classmethod - def cuda_visible_devices(cls) -> List[int]: - return [ - int(cuda_device_id) - for cuda_device_id in get_default(cls.get_var(cls.CUDA_VISIBLE_DEVICES), "").split(",") - ] - - @classmethod - def num_gpus(cls, provider: str = "cuda") -> int: - if provider == "cuda": - if get_default(cls.get_var(cls.CUDA_VISIBLE_DEVICES), "") == "": - return 0 - return len(cls.cuda_visible_devices()) - raise NotImplementedError(f'Unsupported GPU provider: "{provider}"') diff --git a/src/fmcore/util/filesystem.py b/src/fmcore/util/filesystem.py deleted file mode 100644 index 51a7d27..0000000 --- a/src/fmcore/util/filesystem.py +++ /dev/null @@ -1,454 +0,0 @@ -import errno -import glob -import io -import json -import os -import pathlib -import pickle -import shutil -import time -from typing import * - -import yaml - -from fmcore.util.language import String, as_list, remove_values - - -class FileSystemUtil: - def __init__(self): - raise TypeError(f'Cannot instantiate utility class "{str(self.__class__)}"') - - @classmethod - def exists(cls, path: str) -> bool: - return pathlib.Path(path).exists() - - @classmethod - def dir_exists(cls, path: str) -> bool: - try: - path: str = cls.expand_dir(path) - return pathlib.Path(path).is_dir() - except OSError as e: - if e.errno == errno.ENAMETOOLONG: - return False - raise e - - @classmethod - def dirs_exist(cls, paths: List[str], ignore_files: bool = True) -> bool: - for path in paths: - if ignore_files and cls.file_exists(path): - continue - if not cls.dir_exists(path): - return False - return True - - @classmethod - def is_path_valid_dir(cls, path: str) -> bool: - path: str = cls.expand_dir(path) - path: str = String.assert_not_empty_and_strip( - path, error_message=f'Following path is not a valid local directory: "{path}"' - ) - return path.endswith(os.path.sep) or cls.dir_exists(path) - - @classmethod - def file_exists(cls, path: str) -> bool: - try: - path: str = cls.expand_dir(path) - return pathlib.Path(path).is_file() - except OSError as e: - if e.errno == errno.ENAMETOOLONG: - return False - raise e - - @classmethod - def check_file_exists(cls, path: str): - if cls.file_exists(path) is False: - raise FileNotFoundError(f'Could not find file at location "{path}"') - - @classmethod - def check_dir_exists(cls, path: str): - if cls.dir_exists(path) is False: - raise FileNotFoundError(f'Could not find dir at location "{path}"') - - @classmethod - def files_exist(cls, paths: List[str], ignore_dirs: bool = True) -> bool: - for path in paths: - if ignore_dirs and cls.dir_exists(path): - continue - if not cls.file_exists(path): - return False - return True - - @classmethod - def get_dir(cls, path: str) -> str: - """ - Returns the directory of the path. If the path is an existing dir, returns the input. - :param path: input file or directory path. - :return: The dir of the passed path. Always ends in '/'. 
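A quick sketch of the EnvUtil accessors above; the environment values set here are purely illustrative.

import os

## Case-insensitive lookup falls back to the upper/lower-cased key:
os.environ["LOG_LEVEL"] = "DEBUG"
print(EnvUtil.get_var("log_level"))    ## "DEBUG"

## GPU discovery is driven entirely by CUDA_VISIBLE_DEVICES:
os.environ[EnvUtil.CUDA_VISIBLE_DEVICES] = "0,1,3"
print(EnvUtil.cuda_visible_devices())  ## [0, 1, 3]
print(EnvUtil.num_gpus())              ## 3

os.environ[EnvUtil.CUDA_VISIBLE_DEVICES] = ""
print(EnvUtil.num_gpus())              ## 0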
- """ - path: str = String.assert_not_empty_and_strip(path) - path: str = cls.expand_dir(path) - if not cls.dir_exists(path): ## Works for both /home/seldon and /home/seldon/ - path: str = os.path.dirname(path) - return cls.construct_nested_dir_path(path) - - @classmethod - def mkdir_if_does_not_exist(cls, path: str, *, raise_error: bool = False) -> bool: - try: - path: str = cls.expand_dir(path) - dir_path: str = cls.get_dir(path) - if not cls.is_writable(dir_path): - raise OSError(f'Insufficient permissions to create directory at path "{path}"') - os.makedirs(dir_path, exist_ok=True) - if not cls.dir_exists(dir_path): - raise OSError(f'Could not create directory at path "{path}"') - return True - except Exception as e: - if raise_error: - raise e - return False - - @classmethod - def expand_dir(cls, path: Union[str, pathlib.Path]) -> str: - is_dir: bool = False - if isinstance(path, pathlib.Path): - path: str = str(path) - if pathlib.Path(path).is_dir() or path.endswith(os.path.sep): - is_dir: bool = True - path: str = str(path) - if path.startswith("~"): - path: str = os.path.expanduser(path) - path: str = os.path.abspath(path) - if is_dir: - path: str = path if path.endswith(os.path.sep) else path + os.path.sep - return path - - @classmethod - def is_writable(cls, path: str) -> bool: - """ - Checkes whether the current user has sufficient permissions to write files in the passed directory. - Backs off to checking parent files until it hits the root (this handles cases where the path may not exist yet). - Ref: modified from https://stackoverflow.com/a/34102855 - :param path: path to check directory. If file path is passed, will check in that file's directory. - :return: True if the current user has write permissions. - """ - ## Parent directory of the passed path. - path: str = cls.expand_dir(path) - dir: str = cls.get_dir(path) - if cls.dir_exists(dir): - return os.access(dir, os.W_OK) - dir_parents: Sequence = pathlib.Path(dir).parents - for i in range(len(dir_parents)): - if cls.dir_exists(dir_parents[i]): - return os.access(dir_parents[i], os.W_OK) - return False - - @classmethod - def list_files_in_dir(cls, *args, **kwargs) -> List[str]: - return cls.list(*args, **kwargs) - - @classmethod - def list( - cls, - path: str, - *, - file_glob: str = String.DOUBLE_ASTERISK, - ignored_files: Union[str, List[str]] = None, - recursive: bool = False, - only_files: bool = False, - only_subdirs: bool = False, - **kwargs, - ) -> List[str]: - if ignored_files is None: - ignored_files = [] - ignored_files: List[str] = as_list(ignored_files) - if not isinstance(file_glob, str): - raise ValueError(f"`file_glob` must be a string; found {type(file_glob)} with value {file_glob}") - if only_files and only_subdirs: - raise ValueError( - "Cannot set both `only_files` and `only_subdir` to True; at most one must be set." 
- ) - path: str = cls.expand_dir(path) - fpaths: List[str] = glob.glob(os.path.join(path, file_glob), recursive=recursive) - file_names_map: Dict[str, str] = {file_path: os.path.basename(file_path) for file_path in fpaths} - file_names_map = remove_values(file_names_map, ignored_files) - fpaths: List[str] = sorted(list(file_names_map.keys())) - if only_files: - fpaths: List[str] = [file_path for file_path in fpaths if cls.file_exists(file_path)] - if only_subdirs: - fpaths: List[str] = [file_path for file_path in fpaths if cls.dir_exists(file_path)] - return fpaths if len(fpaths) > 0 else [] - - @classmethod - def list_first_file_in_dir( - cls, path: str, file_glob=String.ASTERISK, ignored_files=None - ) -> Optional[str]: - path: str = cls.expand_dir(path) - file_paths: List[str] = cls.list_files_in_dir(path, file_glob=file_glob, ignored_files=ignored_files) - return file_paths[0] if len(file_paths) > 0 else None - - @classmethod - def list_only_file_in_dir(cls, path: str, file_glob=String.ASTERISK, ignored_files=None) -> Optional[str]: - path: str = cls.expand_dir(path) - if cls.file_exists(path): - return path ## Is actually a file - file_paths: List[str] = cls.list_files_in_dir(path, file_glob=file_glob, ignored_files=ignored_files) - if len(file_paths) == 0: - return None - if len(file_paths) > 1: - raise FileNotFoundError("Multiple matching files are present in the directory") - return file_paths[0] - - @classmethod - def get_file_size( - cls, - path: Union[List[str], str], - unit: Optional[str] = None, - decimals: int = 3, - ) -> Union[float, str]: - fpaths: List[str] = as_list(path) - size_in_bytes: int = int(sum([pathlib.Path(fpath).stat().st_size for fpath in fpaths])) - if unit is not None: - return String.convert_size_from_bytes(size_in_bytes, unit=unit, decimals=decimals) - return String.readable_bytes(size_in_bytes, decimals=decimals) - - @classmethod - def get_time_last_modified(cls, path: str, decimals=3): - path = String.assert_not_empty_and_strip(path) - path: str = cls.expand_dir(path) - assert cls.exists(path), f"Path {path} does not exist." - return round(os.path.getmtime(path), decimals) - - @classmethod - def get_last_modified_time(cls, path: str): - path: str = cls.expand_dir(path) - assert cls.exists(path), f"Path {path} does not exist." 
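A sketch of how the listing helpers above are typically driven; the directory layout and file names are hypothetical.

## Recursively list CSV files (not sub-directories) under a hypothetical data directory,
## skipping a manifest file:
csv_paths = FileSystemUtil.list(
    "~/data/experiments/",
    file_glob="**/*.csv",
    recursive=True,
    only_files=True,
    ignored_files="manifest.csv",
)

## Convenience wrappers over the same machinery:
first_csv = FileSystemUtil.list_first_file_in_dir("~/data/experiments/", file_glob="*.csv")
total_size = FileSystemUtil.get_file_size(csv_paths)  ## Human-readable string, e.g. "1.234 MB"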
- return os.path.getmtime(path) - - @classmethod - def get_seconds_since_last_modified(cls, path: str, decimals=3): - path: str = cls.expand_dir(path) - return round(time.time() - cls.get_last_modified_time(path), decimals) - - @classmethod - def read( - cls, - path: str, - *, - concat: bool = False, - concat_sep: str = "\n", - **kwargs, - ) -> Optional[Union[Dict[str, str], str]]: - if cls.file_exists(path): - return cls.get_file_str(path, **kwargs) - elif cls.dir_exists(path): - out = {fpath: cls.get_file_str(fpath, **kwargs) for fpath in cls.list(path, **kwargs)} - if not concat: - return out - return concat_sep.join([out[fpath] for fpath in sorted(list(out.keys()))]) - raise OSError(f'Path "{path}" is neither an existing file or directory.') - - @classmethod - def get_file_str( - cls, - path: str, - *, - encoding: str = "utf-8", - errors: str = "replace", - raise_error: bool = False, - **kwargs, - ) -> Optional[str]: - path: str = cls.expand_dir(path) - try: - with io.open(path, "r", encoding=encoding, errors=errors) as inp: - file_str = inp.read() - String.assert_not_empty(file_str) - return file_str - except Exception as e: - if raise_error: - raise e - return None - - @classmethod - def get_file_bytes(cls, path: str, *, raise_error: bool = False) -> Optional[bytes]: - path: str = cls.expand_dir(path) - try: - with io.open(path, "rb") as inp: - file_bytes = inp.read() - String.assert_not_empty_bytes(file_bytes) - return file_bytes - except Exception as e: - if raise_error: - raise e - return None - - @classmethod - def get_file_pickle(cls, path: str, *, raise_error: bool = False) -> Optional[Any]: - path: str = cls.expand_dir(path) - try: - with io.open(path, "rb") as inp: - data = pickle.load(inp) - assert data is not None - return data - except Exception as e: - if raise_error: - raise e - return None - - @classmethod - def get_json(cls, path: str, *, raise_error: bool = False): - path: str = cls.expand_dir(path) - try: - with io.open(path, "r") as inp: - return json.load(inp) - except Exception as e: - if raise_error: - raise e - return None - - @classmethod - def get_yaml(cls, path: str, *, raise_error: bool = False): - path: str = cls.expand_dir(path) - try: - with io.open(path, "r") as inp: - return yaml.safe_load(inp) - except Exception as e: - if raise_error: - raise e - return None - - @classmethod - def touch_file( - cls, - path: str, - **kwargs, - ) -> bool: - return cls.put_file_str(path=path, file_str="", **kwargs) - - @classmethod - def put_file_str( - cls, - path: str, - file_str: str, - overwrite: bool = True, - raise_error: bool = True, - ) -> bool: - path: str = cls.expand_dir(path) - if cls.file_exists(path) and overwrite is False: - if raise_error: - raise FileExistsError(f"File already exists at {path}, set overwrite=True to overwrite it.") - return False - try: - with io.open(path, "w") as out: - out.write(file_str) - return True - except Exception as e: - if raise_error: - raise e - return False - - @classmethod - def put_file_pickle( - cls, - path: str, - data: Any, - overwrite: bool = True, - raise_error: bool = True, - ) -> bool: - path: str = cls.expand_dir(path) - if cls.file_exists(path) and overwrite is False: - if raise_error: - raise FileExistsError(f"File already exists at {path}, set overwrite=True to overwrite it.") - return False - try: - with io.open(path, "wb") as out: - pickle.dump(data, out) - return True - except Exception as e: - if raise_error: - raise e - return False - - @classmethod - def rm_file(cls, path: str, *, raise_error: bool = 
True): - path: str = cls.expand_dir(path) - if cls.file_exists(path): - try: - os.remove(path) - except Exception as e: - if raise_error: - raise e - return False - - @classmethod - def copy_dir(cls, source: str, destination: str, *, mkdir: bool = True, raise_error: bool = True) -> bool: - """Copies one dir to another dir, potentially overwriting files in the destination dir.""" - source: str = cls.expand_dir(source) - if not cls.dir_exists(source): - if not raise_error: - return False - raise OSError(f'Could not find source directory at path "{source}"') - destination: str = cls.expand_dir(destination) - if not cls.is_path_valid_dir(destination): - if not raise_error: - return False - raise OSError(f'Destination is not a valid directory path: "{destination}"') - if mkdir: - if not cls.mkdir_if_does_not_exist(destination, raise_error=False): - if not raise_error: - return False - raise OSError(f'Cannot create destination directory at path: "{destination}"') - shutil.copytree(source, destination, dirs_exist_ok=True) - return True - - @classmethod - def construct_path_in_dir(cls, path: str, name: str, is_dir: bool, **kwargs) -> str: - if not path.endswith(os.path.sep): - path += os.path.sep - if is_dir is False: - out: str = cls.construct_file_path_in_dir(path, name, **kwargs) - else: - out: str = cls.construct_subdir_path_in_dir(path, name) - return out - - @classmethod - def construct_file_path_in_dir(cls, path: str, name: str, file_ending: Optional[str] = None) -> str: - """ - If the path is a dir, uses the inputs to construct a file path. - If path is a file, returns the path unchanged. - :param path: path to dir (or file) on filesystem. - :param name: name of the file. - :param file_ending: (optional) a string of the file ending. - :return: file path string. - """ - path: str = cls.expand_dir(path) - if cls.is_path_valid_dir(path): - file_name: str = String.assert_not_empty_and_strip(name) - if file_ending is not None: - file_name += String.assert_not_empty_and_strip(file_ending) - return os.path.join(cls.get_dir(path), file_name) - else: - return path - - @classmethod - def construct_subdir_path_in_dir(cls, path: str, name: str) -> str: - """ - Uses the inputs to construct a subdir path. - :param path: path to dir on filesystem. - :param name: name of the subdir. - :return: subdir path string. - """ - path: str = cls.expand_dir(path) - if not cls.is_path_valid_dir(path): - raise ValueError(f'Base dir path "{path}" is not a valid directory.') - name: str = String.assert_not_empty_and_strip(name) - path: str = os.path.join(cls.get_dir(path), name) - if not path.endswith(os.path.sep): - path += os.path.sep - return path - - @classmethod - def construct_nested_dir_path(cls, path: str, *other_paths: Tuple[str]) -> str: - String.assert_not_empty(path) - other_paths = tuple([str(x) for x in other_paths]) - path = os.path.join(path, *other_paths) - return path if path.endswith(os.path.sep) else path + os.path.sep diff --git a/src/fmcore/util/jupyter.py b/src/fmcore/util/jupyter.py deleted file mode 100644 index 7cd0547..0000000 --- a/src/fmcore/util/jupyter.py +++ /dev/null @@ -1,220 +0,0 @@ -import json -import os -import urllib.error -import urllib.request -from itertools import chain -from pathlib import Path, PurePath -from typing import * - -from fmcore.util.language import as_list, not_impl, safe_validate_arguments - -JUPYTER_FILE_ERROR: str = "Can't identify the notebook {}." 
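A minimal write/read round trip through the FileSystemUtil class above, sketched with throw-away paths under /tmp (assumed writable):

import json

tmp_dir = "/tmp/fmcore_fs_demo/"
FileSystemUtil.mkdir_if_does_not_exist(tmp_dir, raise_error=True)

cfg_path = FileSystemUtil.construct_file_path_in_dir(tmp_dir, name="config", file_ending=".json")
FileSystemUtil.put_file_str(cfg_path, json.dumps({"lr": 1e-4, "epochs": 3}), overwrite=True)
print(FileSystemUtil.get_json(cfg_path))         ## {'lr': 0.0001, 'epochs': 3}
print(FileSystemUtil.read(cfg_path))             ## Raw file contents as a string

pkl_path = FileSystemUtil.construct_file_path_in_dir(tmp_dir, name="state", file_ending=".pkl")
FileSystemUtil.put_file_pickle(pkl_path, {"step": 42})
print(FileSystemUtil.get_file_pickle(pkl_path))  ## {'step': 42}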
-JUPYTER_CONN_ERROR: str = ( - "Unable to access server;\n" + "ipynbname requires either no security or token based security." -) - - -class JupyterNotebook: - """Copied from: https://github.com/msm1089/ipynbname/blob/master/ipynbname/__init__.py""" - - @classmethod - def _list_maybe_running_servers(cls, runtime_dir=None) -> Generator[dict, None, None]: - """Iterate over the server info files of running notebook servers.""" - from jupyter_core.paths import jupyter_runtime_dir - - if runtime_dir is None: - runtime_dir = jupyter_runtime_dir() - runtime_dir = Path(runtime_dir) - - if runtime_dir.is_dir(): - # Get notebook configuration files, sorted to check the more recently modified ones first - for file_name in sorted( - chain( - runtime_dir.glob("nbserver-*.json"), # jupyter notebook (or lab 2) - runtime_dir.glob("jpserver-*.json"), # jupyterlab 3 - ), - key=os.path.getmtime, - reverse=True, - ): - try: - yield json.loads(file_name.read_bytes()) - except json.JSONDecodeError: - # Sometimes we encounter empty JSON files. Ignore them. - pass - - @classmethod - def _get_kernel_id( - cls, - ) -> str: - """Returns the kernel ID of the ipykernel.""" - import ipykernel - - connection_file = Path(ipykernel.get_connection_file()).stem - kernel_id = connection_file.split("-", 1)[1] - return kernel_id - - @classmethod - def _get_sessions(cls, srv): - """Given a server, returns sessions, or HTTPError if access is denied. - NOTE: Works only when either there is no security or there is token - based security. An HTTPError is raised if unable to connect to a - server. - """ - try: - qry_str = "" - token = srv["token"] - if token: - qry_str = f"?token={token}" - if not token and "JUPYTERHUB_API_TOKEN" in os.environ: - token = os.environ["JUPYTERHUB_API_TOKEN"] - url = f"{srv['url']}api/sessions{qry_str}" - # Use a timeout in case this is a stale entry. - with urllib.request.urlopen(url, timeout=0.5) as req: - return json.load(req) - except Exception: - raise urllib.error.HTTPError(JUPYTER_CONN_ERROR) - - @classmethod - def _find_nb_path( - cls, - ) -> Union[Tuple[dict, PurePath], Tuple[None, None]]: - from traitlets.config import MultipleInstanceError - - try: - kernel_id = cls._get_kernel_id() - except (MultipleInstanceError, RuntimeError): - return None, None # Could not determine - for srv in cls._list_maybe_running_servers(): - try: - sessions = cls._get_sessions(srv) - for sess in sessions: - if sess["kernel"]["id"] == kernel_id: - return srv, PurePath(sess["notebook"]["path"]) - except Exception: - pass # There may be stale entries in the runtime directory - return None, None - - @classmethod - def name(cls, *, extension: bool = False) -> Optional[str]: - """Returns the short name of the notebook w/o the .ipynb extension, - or raises a FileNotFoundError exception if it cannot be determined. - """ - try: - _, path = cls._find_nb_path() - if path: - if extension: - return path.name - return path.stem - raise FileNotFoundError(JUPYTER_FILE_ERROR.format("name")) - except Exception: - return None - - @classmethod - def path( - cls, - ) -> Optional[str]: - """Returns the absolute path of the notebook, - or raises a FileNotFoundError exception if it cannot be determined. 
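A short sketch of the intended call pattern for the notebook helpers above; outside Jupyter the lookups degrade to None rather than raising.

nb_name = JupyterNotebook.name(extension=True)
if nb_name is not None:
    print(f"Running inside notebook: {nb_name}")
else:
    print("Not running inside a Jupyter notebook (or the kernel/server could not be resolved).")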
- """ - try: - srv, path = cls._find_nb_path() - if srv and path: - root_dir = Path(srv.get("root_dir") or srv["notebook_dir"]) - return str(root_dir / path) - raise FileNotFoundError(JUPYTER_FILE_ERROR.format("path")) - except Exception: - return None - - @staticmethod - def is_notebook() -> bool: - """Returns True when using JupyterNotebook, False for both IPython and basic python interpreter.""" - return JupyterNotebook.name() is not None - - -def print_md(x): - try: - from IPython.display import Markdown, display - - x = Markdown(x) - except ImportError: - display = print - display(x) - - -def print_math(x): - try: - from IPython.display import Math, display - - x = Math(x) - except ImportError: - display = print - display(x) - - -def display_colors(colors: Union[Set[str], Tuple[str, ...], List[str], str]): - """Displays colors from the given list with their names or codes.""" - # Start the HTML string for the colored divs - html_str: str = "

" - - # Loop through the colors, adding each as a small colored div with a label - for color in as_list(colors): - html_str += f""" -
-
-
{color.lower()}
-
- """ - - # Close the main div - html_str += "
" - - # Display the HTML - try: - from IPython.display import HTML, display - except ImportError: - display = print - HTML = lambda x: str(x) - display(HTML(html_str)) - - -@safe_validate_arguments -def plotsum( - plots_list: Union[List[Tuple[str, Any]], List[Any]], - *, - order: Optional[List[str]] = None, - how: Literal["overlay", "grid"] = "grid", - legend: Literal["first", "last", "none"] = "none", - update_layout: Optional[Dict] = None, - backend: Literal["plotly"] = "plotly", -): - if order is not None: - assert len(plots_list) > 0 - assert len(order) == len(plots_list) - assert len(set(p[0] for p in plots_list)) == len(order) - ordered_plots_list: List[Any] = [] - for order_item in order: - plot_str: Optional = None - for plot_str, plot in plots_list: - if plot_str == order_item: - break - plot_str = None - if plot_str is None: - raise ValueError(f'No plot found with name: "{order_item}"') - ordered_plots_list.append(plot) - plots_list = ordered_plots_list - - plots = None - for plot in plots_list: - if isinstance(plot, tuple): - assert len(plot) == 2 - plot = plot[1] - if plots is None: - plots = plot - else: - if how == "grid": - plots += plot - elif how == "overlay": - plots *= plot - else: - raise not_impl("how", how) - return plots diff --git a/src/fmcore/util/language/__init__.py b/src/fmcore/util/language/__init__.py deleted file mode 100644 index 5845e19..0000000 --- a/src/fmcore/util/language/__init__.py +++ /dev/null @@ -1,13 +0,0 @@ -from fmcore.util.language._alias import * -from fmcore.util.language._autoenum import * -from fmcore.util.language._function import * -from fmcore.util.language._import import * -from fmcore.util.language._iter import * -from fmcore.util.language._math import * -from fmcore.util.language._pbar import * -from fmcore.util.language._selection import * -from fmcore.util.language._string import * -from fmcore.util.language._structs import * -from fmcore.util.language._testing import * -from fmcore.util.language._typing import * -from fmcore.util.language._utils import * diff --git a/src/fmcore/util/language/_alias.py b/src/fmcore/util/language/_alias.py deleted file mode 100644 index 64d698f..0000000 --- a/src/fmcore/util/language/_alias.py +++ /dev/null @@ -1,374 +0,0 @@ -from typing import * - -from ._utils import Utility, get_default - - -def set_param_from_alias( - params: Dict, - param: str, - alias: Union[Tuple[str, ...], List[str], Set[str], str], - remove_alias: bool = True, - prioritize_aliases: bool = False, - default: Optional[Any] = None, -): - if isinstance(alias, (list, tuple, set)): - alias: List = list(alias) - elif isinstance(alias, str): - alias: str = alias.strip() - assert len(alias) > 0 - alias: List = [alias] - else: - raise NotImplementedError(f"Invalid argument alias of type {type(alias)}") - if prioritize_aliases: - param_names: List = alias + [param] - else: - param_names: List = [param] + alias - if remove_alias: - value: Optional[Any] = get_default( - *[params.pop(param_name, None) for param_name in param_names], default - ) - else: - value: Optional[Any] = get_default( - *[params.get(param_name, None) for param_name in param_names], default - ) - if value is not None: - ## If none are set, use default value: - params[param] = value - - -class _AliasMeta(type): - def __getattr__(cls, attr_name: str) -> Callable: - if attr_name.startswith("get_"): - param_name: str = attr_name.replace("get_", "") - setter_name: str = f"set_{param_name}" - if not hasattr(cls, setter_name): - raise AttributeError( - f"`{attr_name}` is does 
not have a corresponding setter function `{setter_name}`." - ) - setter: Callable = getattr(cls, setter_name) - - def getter(params: Dict, *args, pop: bool = True, **kwargs): - setter(params, *args, **kwargs) - if pop: - return params.pop(param_name, None) - else: - return params.get(param_name, None) - - return getter - raise AttributeError(f"`{attr_name}` is not an attribute of {cls.__name__}.") - - -class Alias(Utility, metaclass=_AliasMeta): - @classmethod - def set_AlgorithmClass(cls, params: Dict, param: str = "AlgorithmClass", **kwargs): - set_param_from_alias(params, param=param, alias=["algorithm", "AlgorithmClass"], **kwargs) - - @classmethod - def set_retry(cls, params: Dict, param: str = "retry", **kwargs): - set_param_from_alias(params, param=param, alias=["retries", "num_retries", "retry"], **kwargs) - - @classmethod - def set_data_schema(cls, params: Dict, param: str = "data_schema", **kwargs): - set_param_from_alias(params, param=param, alias=["schema", "dataset_schema", "data_schema"], **kwargs) - - @classmethod - def set_data_split(cls, params: Dict, param: str = "data_split", **kwargs): - set_param_from_alias( - params, - param=param, - alias=["dataset_type", "split", "dataset_split", "data_split", "predictions_split"], - **kwargs, - ) - - @classmethod - def set_stream_as(cls, params: Dict, param: str = "stream_as", **kwargs): - set_param_from_alias(params, param=param, alias=["stream_as", "stream_layout", "iter_as"], **kwargs) - - @classmethod - def set_num_rows(cls, params: Dict, param: str = "num_rows", **kwargs): - set_param_from_alias(params, param=param, alias=["batch_size", "nrows", "num_rows"], **kwargs) - - @classmethod - def set_predict_batch_size(cls, params: Dict, param: str = "predict_batch_size", **kwargs): - set_param_from_alias( - params, - param=param, - alias=["predict_batch_size", "eval_batch_size", "nrows", "num_rows", "batch_size"], - **kwargs, - ) - - @classmethod - def set_num_chunks(cls, params: Dict, param: str = "num_chunks", **kwargs): - set_param_from_alias(params, param=param, alias=["num_batches", "nchunks", "num_chunks"], **kwargs) - - @classmethod - def set_shuffle(cls, params: Dict, param: str = "shuffle", **kwargs): - set_param_from_alias(params, param=param, alias=["shuffle"], **kwargs) - - @classmethod - def set_top_k(cls, params: Dict, param: str = "top_k", **kwargs): - set_param_from_alias(params=params, param=param, alias=["k", "top_k"], **kwargs) - - @classmethod - def set_seed(cls, params: Dict, param: str = "seed", **kwargs): - set_param_from_alias(params, param=param, alias=["random_state", "random_seed", "seed"], **kwargs) - - @classmethod - def set_shard_seed(cls, params: Dict, param: str = "shard_seed", **kwargs): - set_param_from_alias( - params, param=param, alias=["shard_random_state", "shard_random_seed", "shard_seed"], **kwargs - ) - - @classmethod - def set_mapper(cls, params: Dict, param: str = "mapper", **kwargs): - set_param_from_alias(params, param=param, alias=["mapping_fn", "mapping", "map", "mapper"], **kwargs) - - @classmethod - def set_map_apply(cls, params: Dict, param: str = "map_apply", **kwargs): - set_param_from_alias( - params, param=param, alias=["mapper_apply", "mapping_apply", "map_apply"], **kwargs - ) - - @classmethod - def set_map_executor(cls, params: Dict, param: str = "map_executor", **kwargs): - set_param_from_alias(params, param=param, alias=["executor", "map_executor"], **kwargs) - - @classmethod - def set_map_failure(cls, params: Dict, param: str = "map_failure", **kwargs): - 
set_param_from_alias(params, param=param, alias=["mapper_failure", "map_failure"], **kwargs) - - @classmethod - def set_num_workers(cls, params: Dict, param: str = "num_workers", **kwargs): - set_param_from_alias( - params, - param=param, - alias=[ - "num_processes", - "n_processes", - "n_process", - "n_proc", - "n_jobs", - "map_num_workers", - "num_workers", - ], - **kwargs, - ) - - @classmethod - def set_parallelize(cls, params: Dict, param: str = "parallelize", **kwargs): - set_param_from_alias( - params, param=param, alias=["map_parallelize", "parallel", "parallelize"], **kwargs - ) - - @classmethod - def set_shard_rank(cls, params: Dict, param: str = "shard_rank", **kwargs): - set_param_from_alias(params, param=param, alias=["shard_idx", "shard_i", "shard_rank"], **kwargs) - - @classmethod - def set_num_shards(cls, params: Dict, param: str = "num_shards", **kwargs): - set_param_from_alias(params, param=param, alias=["world_size", "num_shards"], **kwargs) - - @classmethod - def set_format(cls, params: Dict, param: str = "format", **kwargs): - set_param_from_alias(params, param=param, alias=["file_format", "format"], **kwargs) - - @classmethod - def set_metrics(cls, params: Dict, param: str = "metrics", **kwargs): - set_param_from_alias( - params, param=param, alias=["metric", "metrics_list", "metric_list", "metrics"], **kwargs - ) - - @classmethod - def set_return_predictions(cls, params: Dict, param: str = "return_predictions", **kwargs): - set_param_from_alias( - params, - param=param, - alias=[ - "return_preds", - "preds", - "predictions", - "return_predictions", - ], - **kwargs, - ) - - @classmethod - def set_predictions_destination(cls, params: Dict, param: str = "predictions_destination", **kwargs): - set_param_from_alias( - params, - param=param, - alias=[ - "preds_destination", - "save_predictions", - "save_preds", - "save_preds_to", - "save_to", - "predictions_destination", - ], - **kwargs, - ) - - @classmethod - def set_tracker(cls, params: Dict, param: str = "tracker", **kwargs): - set_param_from_alias( - params, - param=param, - alias=[ - "experiment_tracker", - "experiment", - "experiment_name", - "trial", - "trial_name", - "tracker", - ], - **kwargs, - ) - - @classmethod - def set_progress_bar(cls, params: Dict, param: str = "progress_bar", **kwargs): - set_param_from_alias( - params, - param=param, - alias=[ - "progress", - "progress_bar", - "pbar", - ], - **kwargs, - ) - - @classmethod - def get_progress_bar(cls, params, *, default_progress_bar: bool = True, **kwargs) -> Optional[Dict]: - Alias.set_progress_bar(params, **kwargs) - progress_bar: Union[Dict, bool] = params.pop("progress_bar", default_progress_bar) - if progress_bar is False: - progress_bar: Optional[Dict] = None - elif progress_bar is True: - progress_bar: Optional[Dict] = dict() - assert progress_bar is None or isinstance(progress_bar, dict) - return progress_bar - - @classmethod - def set_silent(cls, params: Dict, param: str = "silent", **kwargs): - set_param_from_alias(params, param=param, alias=["quiet"], **kwargs) - - @classmethod - def set_model_dir(cls, params: Dict, param: str = "model_dir", **kwargs): - set_param_from_alias( - params, param=param, alias=["load_model", "pretrained_path", "model_dir"], **kwargs - ) - - @classmethod - def set_cache_dir(cls, params: Dict, param: str = "cache_dir", **kwargs): - set_param_from_alias( - params, param=param, alias=["model_cache_dir", "model_cache", "cache_dir"], **kwargs - ) - - @classmethod - def set_cache_timeout(cls, params: Dict, param: str = 
"cache_timeout", **kwargs): - set_param_from_alias( - params, - param=param, - alias=[ - "cache_timeout_sec", - "cache_timeout_seconds", - "timeout", - "timeout_sec", - "timeout_seconds", - "cache_model_timeout", - "cache_model_timeout_sec", - "cache_model_timeout_seconds", - "model_timeout", - "model_timeout_sec", - "model_timeout_seconds", - "model_cache_timeout", - "model_cache_timeout_sec", - "model_cache_timeout_seconds", - "actor_timeout", - "actor_timeout_sec", - "actor_timeout_seconds", - "keepalive", - "keepalive_timeout", - "keepalive_timeout_sec", - "keepalive_timeout_seconds", - ], - **kwargs, - ) - - @classmethod - def set_custom_definitions(cls, params: Dict, param: str = "custom_definitions", **kwargs): - set_param_from_alias( - params, param=param, alias=["udfs", "custom", "custom_classes", "custom_definitions"], **kwargs - ) - - @classmethod - def set_verbosity(cls, params: Dict, param: str = "verbosity", **kwargs): - set_param_from_alias(params, param=param, alias=["verbose", "verbosity"], **kwargs) - - @classmethod - def set_log_file(cls, params: Dict, param: str = "log_file", **kwargs): - set_param_from_alias( - params, - param=param, - alias=[ - "log", - "file", - "path", - "fpath", - "log_fpath", - ], - **kwargs, - ) - - @classmethod - def set_save_model(cls, params: Dict, param: str = "save_model", **kwargs): - set_param_from_alias( - params, - param=param, - alias=[ - "save", - "save_to", - "model_save", - "model_save_path", - "model_save_dir", - "save_model_path", - "save_model_dir", - "save_model", - ], - **kwargs, - ) - - @classmethod - def set_load_model(cls, params: Dict, param: str = "load_model", **kwargs): - set_param_from_alias( - params, - param=param, - alias=[ - "load", - "load_from", - "model_load", - "model_dir", - "model_load_path", - "model_load_dir", - "load_model_path", - "load_model_dir", - "load_model", - ], - **kwargs, - ) - - @classmethod - def failure_action(cls, params: Dict, param: str = "failure_action", **kwargs): - set_param_from_alias( - params, - param=param, - alias=[ - "error_action", - "on_failure", - "on_error", - "error_behavior", - "failure_behavior", - "failure_action", - ], - **kwargs, - ) diff --git a/src/fmcore/util/language/_autoenum.py b/src/fmcore/util/language/_autoenum.py deleted file mode 100644 index 3215665..0000000 --- a/src/fmcore/util/language/_autoenum.py +++ /dev/null @@ -1,309 +0,0 @@ -from enum import Enum, auto -from typing import * - - -class alias(auto): - def __init__(self, *aliases): - if len(aliases) == 0: - raise ValueError("Cannot have empty alias() call.") - for a in aliases: - if not isinstance(a, str): - raise ValueError( - f"All aliases for must be strings; found alias of type {type(a)} having value: {a}" - ) - self.names = aliases - self.enum_name = None - - def __repr__(self) -> str: - return str(self) - - def __str__(self): - if self.enum_name is not None: - return self.enum_name - return self.alias_repr - - @property - def alias_repr(self) -> str: - return str(f"alias:{list(self.names)}") - - def __setattr__(self, attr_name: str, attr_value: Any): - if attr_name == "value": - ## because alias subclasses auto and does not set value, enum.py:143 will try to set value - self.enum_name = attr_value - else: - super(alias, self).__setattr__(attr_name, attr_value) - - def __getattribute__(self, attr_name: str): - """ - Refer these lines in Python 3.10.9 enum.py: - - class _EnumDict(dict): - ... - def __setitem__(self, key, value): - ... - elif not _is_descriptor(value): - ... 
- if isinstance(value, auto): - if value.value == _auto_null: - value.value = self._generate_next_value( - key, - 1, - len(self._member_names), - self._last_values[:], - ) - self._auto_called = True - value = value.value - ... - ... - ... - - """ - if attr_name == "value": - if object.__getattribute__(self, "enum_name") is None: - ## Gets _auto_null as alias inherits auto class but does not set `value` class member; refer enum.py:142 - try: - return object.__getattribute__(self, "value") - except Exception: - from enum import _auto_null - - return _auto_null - return self - return object.__getattribute__(self, attr_name) - - -_DEFAULT_REMOVAL_TABLE = str.maketrans( - "ABCDEFGHIJKLMNOPQRSTUVWXYZ", - "abcdefghijklmnopqrstuvwxyz", - " -_.:;,", ## Will be removed -) - - -class AutoEnum(str, Enum): - """ - Utility class which can be subclassed to create enums using auto() and alias(). - Also provides utility methods for common enum operations. - """ - - def __init__(self, value: Union[str, alias]): - self.aliases: Tuple[str, ...] = tuple() - if isinstance(value, alias): - self.aliases: Tuple[str, ...] = value.names - - @classmethod - def _missing_(cls, enum_value: Any): - ## Ref: https://stackoverflow.com/a/60174274/4900327 - ## This is needed to allow Pydantic to perform case-insensitive conversion to AutoEnum. - return cls.from_str(enum_value=enum_value, raise_error=True) - - def _generate_next_value_(name, start, count, last_values): - return name - - @property - def str(self) -> str: - return self.__str__() - - def __repr__(self): - return self.__str__() - - def __str__(self): - return self.name - - def __hash__(self): - return hash(self.__class__.__name__ + "." + self.name) - - def __eq__(self, other): - return self is other - - def __ne__(self, other): - return self is not other - - def matches(self, enum_value: str) -> bool: - return self is self.from_str(enum_value, raise_error=False) - - @classmethod - def matches_any(cls, enum_value: str) -> bool: - return cls.from_str(enum_value, raise_error=False) is not None - - @classmethod - def does_not_match_any(cls, enum_value: str) -> bool: - return not cls.matches_any(enum_value) - - @classmethod - def display_names(cls, **kwargd) -> str: - return str([enum_value.display_name(**kwargd) for enum_value in list(cls)]) - - def display_name(self, *, sep: str = " ") -> str: - return sep.join( - [ - word.lower() if word.lower() in ("of", "in", "the") else word.capitalize() - for word in str(self).split("_") - ] - ) - - @classmethod - def _initialize_lookup(cls): - if "_value2member_map_normalized_" not in cls.__dict__: ## Caching values for fast retrieval. - cls._value2member_map_normalized_ = {} - - def _set_normalized(e, normalized_e_name): - if normalized_e_name in cls._value2member_map_normalized_: - raise ValueError( - f'Cannot register enum "{e.name}"; ' - f'another enum with the same normalized name "{normalized_e_name}" already exists.' - ) - cls._value2member_map_normalized_[normalized_e_name] = e - - for e in list(cls): - _set_normalized(e, cls._normalize(e.name)) - if len(e.aliases) > 0: - ## Add the alias-repr to the lookup: - _set_normalized(e, cls._normalize(alias(*e.aliases).alias_repr)) - for e_alias in e.aliases: - _set_normalized(e, cls._normalize(e_alias)) - - @classmethod - def from_str(cls, enum_value: str, raise_error: bool = True) -> Optional: - """ - Performs a case-insensitive lookup of the enum value string among the members of the current AutoEnum subclass. 
- :param enum_value: enum value string - :param raise_error: whether to raise an error if the string is not found in the enum - :return: an enum value which matches the string - :raises: ValueError if raise_error is True and no enum value matches the string - """ - if isinstance(enum_value, cls): - return enum_value - if enum_value is None and raise_error is False: - return None - if not isinstance(enum_value, str) and raise_error is True: - raise ValueError(f"Input should be a string; found type {type(enum_value)}") - cls._initialize_lookup() - enum_obj: Optional[AutoEnum] = cls._value2member_map_normalized_.get(cls._normalize(enum_value)) - if enum_obj is None and raise_error is True: - raise ValueError( - f"Could not find enum with value {repr(enum_value)}; available values are: {list(cls)}." - ) - return enum_obj - - @classmethod - def _normalize(cls, x: str) -> str: - ## Found to be faster than .translate() and re.sub() on Python 3.10.6 - return str(x).translate(_DEFAULT_REMOVAL_TABLE) - - @classmethod - def convert_keys(cls, d: Dict) -> Dict: - """ - Converts string dict keys to the matching members of the current AutoEnum subclass. - Leaves non-string keys untouched. - :param d: dict to transform - :return: dict with matching string keys transformed to enum values - """ - out_dict = {} - for k, v in d.items(): - if isinstance(k, str) and cls.from_str(k, raise_error=False) is not None: - out_dict[cls.from_str(k, raise_error=False)] = v - else: - out_dict[k] = v - return out_dict - - @classmethod - def convert_keys_to_str(cls, d: Dict) -> Dict: - """ - Converts dict keys of the current AutoEnum subclass to the matching string key. - Leaves other keys untouched. - :param d: dict to transform - :return: dict with matching keys of the current AutoEnum transformed to strings. - """ - out_dict = {} - for k, v in d.items(): - if isinstance(k, cls): - out_dict[str(k)] = v - else: - out_dict[k] = v - return out_dict - - @classmethod - def convert_values( - cls, d: Union[Dict, Set, List, Tuple], raise_error: bool = False - ) -> Union[Dict, Set, List, Tuple]: - """ - Converts string values to the matching members of the current AutoEnum subclass. - Leaves non-string values untouched. - :param d: dict, set, list or tuple to transform. - :param raise_error: raise an error if unsupported type. - :return: data structure with matching string values transformed to enum values. - """ - if isinstance(d, dict): - return cls.convert_dict_values(d) - if isinstance(d, list): - return cls.convert_list(d) - if isinstance(d, tuple): - return tuple(cls.convert_list(d)) - if isinstance(d, set): - return cls.convert_set(d) - if raise_error: - raise ValueError(f"Unrecognized data structure of type {type(d)}") - return d - - @classmethod - def convert_dict_values(cls, d: Dict) -> Dict: - """ - Converts string dict values to the matching members of the current AutoEnum subclass. - Leaves non-string values untouched. - :param d: dict to transform - :return: dict with matching string values transformed to enum values - """ - out_dict = {} - for k, v in d.items(): - if isinstance(v, str) and cls.from_str(v, raise_error=False) is not None: - out_dict[k] = cls.from_str(v, raise_error=False) - else: - out_dict[k] = v - return out_dict - - @classmethod - def convert_list(cls, l: Union[List, Tuple]) -> List: - """ - Converts string list itmes to the matching members of the current AutoEnum subclass. - Leaves non-string items untouched. 
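A compact sketch of the AutoEnum/alias pattern defined above; Shape and its members are illustrative, not part of this codebase.

from enum import auto

class Shape(AutoEnum):
    CIRCLE = auto()
    SQUARE = alias("box", "quad")   ## Extra lookup names for the same member
    TRIANGLE = auto()

## Case- and punctuation-insensitive conversion (also used by Pydantic via _missing_):
assert Shape.from_str("circle") is Shape.CIRCLE
assert Shape("Sq-uare") is Shape.SQUARE                      ## _missing_ delegates to from_str
assert Shape.from_str("QUAD") is Shape.SQUARE                ## Resolved through the alias
assert Shape.from_str("hexagon", raise_error=False) is None

print(Shape.CIRCLE.display_name())   ## "Circle"
print(Shape.matches_any("box"))      ## True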
- :param l: list to transform - :return: list with matching string items transformed to enum values - """ - out_list = [] - for item in l: - if isinstance(item, str) and cls.matches_any(item): - out_list.append(cls.from_str(item)) - else: - out_list.append(item) - return out_list - - @classmethod - def convert_set(cls, s: Set) -> Set: - """ - Converts string list itmes to the matching members of the current AutoEnum subclass. - Leaves non-string items untouched. - :param s: set to transform - :return: set with matching string items transformed to enum values - """ - out_set = set() - for item in s: - if isinstance(item, str) and cls.matches_any(item): - out_set.add(cls.from_str(item)) - else: - out_set.add(item) - return out_set - - @classmethod - def convert_values_to_str(cls, d: Dict) -> Dict: - """ - Converts dict values of the current AutoEnum subclass to the matching string value. - Leaves other values untouched. - :param d: dict to transform - :return: dict with matching values of the current AutoEnum transformed to strings. - """ - out_dict = {} - for k, v in d.items(): - if isinstance(v, cls): - out_dict[k] = str(v) - else: - out_dict[k] = v - return out_dict diff --git a/src/fmcore/util/language/_function.py b/src/fmcore/util/language/_function.py deleted file mode 100644 index 1f2da67..0000000 --- a/src/fmcore/util/language/_function.py +++ /dev/null @@ -1,286 +0,0 @@ -import ast -import functools -import inspect -import re -import sys -import types -from ast import literal_eval -from typing import * - -from pydantic import BaseModel, Extra, root_validator - -from ._utils import get_default - - -def fn_str(fn): - return f"{get_fn_spec(fn).resolved_name}" - - -get_current_fn_name = lambda n=0: sys._getframe( - n + 1 -).f_code.co_name ## Ref: https://stackoverflow.com/a/31615605 - - -def is_function(fn: Any) -> bool: - ## Ref: https://stackoverflow.com/a/69823452/4900327 - return isinstance( - fn, - ( - types.FunctionType, - types.MethodType, - types.BuiltinFunctionType, - types.BuiltinMethodType, - types.LambdaType, - functools.partial, - ), - ) - - -def call_str_to_params( - call_str: str, - callable_name_key: str = "name", - max_len: int = 1024, -) -> Tuple[List, Dict]: - """Creates params dict from a call string.""" - if len(call_str) > max_len: ## To prevent this attack: https://stackoverflow.com/a/54763776/4900327 - raise ValueError(f"We cannot parse `call_str` beyond {max_len} chars; found {len(call_str)} chars") - call_str: str = call_str.strip() - if not (call_str.find("(") < call_str.find(")")): - raise ValueError( - f"`call_str` must have one opening paren, followed by one closing paren; " - f'found: `call_str`="{call_str}"' - ) - if not call_str.endswith(")"): - raise ValueError(f'`call_str` must end with a closing paren; found: `call_str`="{call_str}"') - name: str = call_str.split("(")[0] - args: List = [] - kwargs: Dict = {callable_name_key: name} - if call_str != f"{name}()": - ## We have some params: - params_str: str = call_str.replace(f"{name}(", "") - assert params_str.endswith(")") - params_str: str = params_str[:-1] - for param_str in params_str.split(","): - param_str: str = param_str.strip() - if "=" not in param_str: - ## Not an arg-value pair, instead just arg: - args.append(literal_eval(param_str)) - elif len(param_str.split("=")) != 2: - ## Cannot resolve arg-value pair: - raise ValueError(f'Found invalid arg-value pair "{param_str}" in `call_str`="{call_str}"') - else: - k, v = param_str.split("=") - ## No, this is not a security issue. 
Ref: https://stackoverflow.com/a/7689085/4900327 - if k == name: - raise ValueError(f'Argument name and callable name overlap: "{name}"') - kwargs[k] = literal_eval(v) - return args, kwargs - - -def params_to_call_str(callable_name: str, args: List, kwargs: Dict) -> str: - sep: str = ", " - stringified = [] - if len(args) > 0: - stringified.append(sep.join(args)) - if len(kwargs) > 0: - stringified.append( - sep.join([f"{k}={v}" for k, v in sorted(list(kwargs.items()), key=lambda x: x[0])]) - ) - return f"{callable_name}({sep.join(stringified)})" - - -def wrap_fn_output(fn: Callable, wrapper_fn: Callable) -> Callable: - """ - Ensures a function always returns objects of a particular class. - :param fn: original function to invoke. - :param wrapper_fn: wrapper which takes as input the original function output and returns a different value. - :return: wrapped function object. - """ - - def do(*args, **kwargs): - return wrapper_fn(fn(*args, **kwargs)) - - return do - - -def parsed_fn_source(function) -> Tuple[str, str]: - # Get the source code of the function - # Parse the source code into an AST - parsed_source = ast.parse(inspect.getsource(function)) - # The first element of the body should be the FunctionDef node for the function - function_node: Any = parsed_source.body[0] - # Extract the body of the FunctionDef node - fn_source: str = ast.unparse(function_node) - # Convert the body back to source code strings - fn_body: str = "\n".join([ast.unparse(stmt) for stmt in function_node.body]) - return fn_source, fn_body - - -class FunctionSpec(BaseModel): - name: str - qualname: str - resolved_name: str - source: str - source_body: str - args: Tuple[str, ...] - varargs_name: Optional[str] - kwargs: Tuple[str, ...] - varkwargs_name: Optional[str] - default_args: Dict[str, Any] - default_kwargs: Dict[str, Any] - ignored_args: Tuple[str, ...] = ("self", "cls") - - class Config: - ## Ref for Pydantic mutability: https://pydantic-docs.helpmanual.io/usage/models/#faux-immutability - allow_mutation = False - ## Ref for Extra.forbid: https://pydantic-docs.helpmanual.io/usage/model_config/#options - extra = Extra.forbid - ## Ref for Pydantic private attributes: https://pydantic-docs.helpmanual.io/usage/models/#private-model-attributes - underscore_attrs_are_private = True - ## Validates default values. Ref: https://pydantic-docs.helpmanual.io/usage/model_config/#options - validate_all = True - ## Validates typing by `isinstance` check. Ref: https://pydantic-docs.helpmanual.io/usage/model_config/#options - arbitrary_types_allowed = True - - @root_validator(pre=False) - def _remove_ignored(cls, params: Dict) -> Dict: - ignored_args: Tuple[str, ...] 
= params["ignored_args"] - params["args"] = tuple(arg_name for arg_name in params["args"] if arg_name not in ignored_args) - params["kwargs"] = tuple(arg_name for arg_name in params["kwargs"] if arg_name not in ignored_args) - params["default_args"] = dict( - (arg_name, default_val) - for arg_name, default_val in params["default_args"].items() - if arg_name not in ignored_args - ) - params["default_kwargs"] = dict( - (arg_name, default_val) - for arg_name, default_val in params["default_kwargs"].items() - if arg_name not in ignored_args - ) - return params - - @property - def args_and_kwargs(self) -> Tuple[str, ...]: - return self.args + self.kwargs - - @property - def default_args_and_kwargs(self) -> Dict[str, Any]: - return {**self.default_args, **self.default_kwargs} - - @property - def required_args_and_kwargs(self) -> Tuple[str, ...]: - default_args_and_kwargs: Dict[str, Any] = self.default_args_and_kwargs - return tuple(arg_name for arg_name in self.args_and_kwargs if arg_name not in default_args_and_kwargs) - - @property - def num_args(self) -> int: - return len(self.args) - - @property - def num_kwargs(self) -> int: - return len(self.kwargs) - - @property - def num_args_and_kwargs(self) -> int: - return self.num_args + self.num_kwargs - - @property - def num_default_args(self) -> int: - return len(self.default_args) - - @property - def num_default_kwargs(self) -> int: - return len(self.default_kwargs) - - @property - def num_default_args_and_kwargs(self) -> int: - return self.num_default_args + self.num_default_kwargs - - @property - def num_required_args_and_kwargs(self) -> int: - return self.num_args_and_kwargs - self.num_default_args_and_kwargs - - -def get_fn_spec(fn: Callable) -> FunctionSpec: - if hasattr(fn, "__wrapped__"): - """ - if a function is wrapped with decorators, unwrap and get all args - eg: pd.read_csv.__code__.co_varnames returns (args, kwargs, arguments) as its wrapped by a decorator @deprecate_nonkeyword_arguments - This line ensures to unwrap all decorators recursively - """ - return get_fn_spec(fn.__wrapped__) - argspec: inspect.FullArgSpec = inspect.getfullargspec(fn) ## Ref: https://stackoverflow.com/a/218709 - - args: Tuple[str, ...] = tuple(get_default(argspec.args, [])) - varargs_name: Optional[str] = argspec.varargs - - kwargs: Tuple[str, ...] = tuple(get_default(argspec.kwonlyargs, [])) - varkwargs_name: Optional[str] = argspec.varkw - - default_args: Tuple[Any, ...] = get_default(argspec.defaults, tuple()) - default_args: Dict[str, Any] = dict( - zip( - argspec.args[-len(default_args) :], ## Get's last len(default_args) values from the args list. - default_args, - ) - ) - default_kwargs: Dict[str, Any] = get_default(argspec.kwonlydefaults, dict()) - - try: - source, source_body = parsed_fn_source(fn) - except IndentationError: - source = inspect.getsource(fn) - source_args_and_body = re.sub(r"^\s*(def\s+\w+\()", "", source, count=1, flags=re.MULTILINE).strip() - source_body: str = source_args_and_body ## Better than nothing. - return FunctionSpec( - name=fn.__name__, - qualname=fn.__qualname__, - resolved_name=fn.__module__ + "." + fn.__qualname__, - source=source, - source_body=source_body, - args=args, - varargs_name=varargs_name, - kwargs=kwargs, - varkwargs_name=varkwargs_name, - default_args=default_args, - default_kwargs=default_kwargs, - ) - - -def get_fn_args( - fn: Union[Callable, FunctionSpec], - *, - ignore: Tuple[str, ...] 
= ("self", "cls", "kwargs"), - include_args: bool = True, - include_kwargs: bool = True, - include_default: bool = True, -) -> Tuple[str, ...]: - if isinstance(fn, FunctionSpec): - fn_spec: FunctionSpec = fn - else: - fn_spec: FunctionSpec = get_fn_spec(fn) - arg_names: List[str] = list() - if include_args: - arg_names.extend(fn_spec.args) - if include_kwargs: - arg_names.extend(fn_spec.kwargs) - if include_default is False: - ignore: List[str] = ( - list(ignore) + list(fn_spec.default_args.keys()) + list(fn_spec.default_kwargs.keys()) - ) - ignore: Set[str] = set(ignore) - arg_names: Tuple[str, ...] = tuple(a for a in arg_names if a not in ignore) - return arg_names - - -def filter_kwargs(fns: Union[Callable, List[Callable], Tuple[Callable, ...]], **kwargs) -> Dict[str, Any]: - to_keep: Set = set() - if isinstance(fns, (list, set, tuple)): - fns = list(fns) - else: - fns = [fns] - for fn in fns: - fn_args: Tuple[str, ...] = get_fn_args(fn) - to_keep.update(set(fn_args)) - filtered_kwargs: Dict[str, Any] = {k: kwargs[k] for k in kwargs if k in to_keep} - return filtered_kwargs diff --git a/src/fmcore/util/language/_import.py b/src/fmcore/util/language/_import.py deleted file mode 100644 index dc139e8..0000000 --- a/src/fmcore/util/language/_import.py +++ /dev/null @@ -1,131 +0,0 @@ -import types -from contextlib import contextmanager -from typing import * - -from pydantic.typing import Literal - - -@contextmanager -def optional_dependency( - *names: Union[List[str], str], - error: Literal["raise", "warn", "ignore"] = "ignore", - warn_every_time: bool = False, - __WARNED_OPTIONAL_MODULES: Set[str] = set(), ## "Private" argument -) -> Optional[Union[Tuple[types.ModuleType, ...], types.ModuleType]]: - """ - A contextmanager (used with "with") which passes code if optional dependencies are not present. - Ref: https://stackoverflow.com/a/73838546/4900327 - - Parameters - ---------- - names: str or list of strings. - The module name(s) which are optional. - error: str {'raise', 'warn', 'ignore'} - What to do when a dependency is not found in the "with" block: - * raise : Raise an ImportError. - * warn: print a warning (see `warn_every_time`). - * ignore: do nothing. - warn_every_time: bool - Whether to warn every time an import is tried. Only applies when error="warn". - Setting this to True will result in multiple warnings if you try to - import the same library multiple times. - - Usage - ----- - ## 1. Only run code if modules exist, otherwise ignore: - with optional_dependency("pydantic", "sklearn", error="ignore"): - from pydantic import BaseModel - from sklearn.metrics import accuracy_score - class AccuracyCalculator(BaseModel): - decimals: int = 5 - def calculate(self, y_pred: List, y_true: List) -> float: - return round(accuracy_score(y_true, y_pred), self.decimals) - print("Defined AccuracyCalculator in global context") - print("Will be printed finally") ## Always prints - - ## 2. Print warnings with error="warn". Multiple warings are be printed via `warn_every_time=True`. - with optional_dependency("pydantic", "sklearn", error="warn"): - from pydantic import BaseModel - from sklearn.metrics import accuracy_score - class AccuracyCalculator(BaseModel): - decimals: int = 5 - def calculate(self, y_pred: List, y_true: List) -> float: - return round(accuracy_score(y_true, y_pred), self.decimals) - print("Defined AccuracyCalculator in global context") - print("Will be printed finally") ## Always prints - - ## 3. 
Raise ImportError warnings with error="raise": - with optional_dependency("pydantic", "sklearn", error="raise"): - from pydantic import BaseModel - from sklearn.metrics import accuracy_score - class AccuracyCalculator(BaseModel): - decimals: int = 5 - def calculate(self, y_pred: List, y_true: List) -> float: - return round(accuracy_score(y_true, y_pred), self.decimals) - print("Defined AccuracyCalculator in global context") - print("Will be printed finally") ## Always prints - """ - assert error in {"raise", "warn", "ignore"} - names: Optional[Set[str]] = set(names) - try: - yield None - except (ImportError, ModuleNotFoundError) as e: - missing_module: str = e.name - if len(names) > 0 and missing_module not in names: - raise e ## A non-optional dependency is missing - if error == "raise": - raise e - if error == "warn": - if missing_module not in __WARNED_OPTIONAL_MODULES or warn_every_time is True: - msg = f'Missing optional dependency "{missing_module}". Use pip or conda to install.' - print(f"Warning: {msg}") - __WARNED_OPTIONAL_MODULES.add(missing_module) - - -_IS_RAY_INSTALLED: bool = False - -with optional_dependency("ray"): - import ray - - assert isinstance(ray.ObjectRef, type) - - _IS_RAY_INSTALLED: bool = True - - -def _check_is_ray_installed(): - if not _IS_RAY_INSTALLED: - raise ImportError('Dependency "ray" is not installed.') - - -_IS_DASK_INSTALLED: bool = False - -DaskDataFrame = "DaskDataFrame" -DaskSeries = "DaskSeries" -with optional_dependency("dask"): - from dask.dataframe import DataFrame as DaskDataFrame - from dask.dataframe import Series as DaskSeries - - assert isinstance(DaskDataFrame, type) - assert isinstance(DaskSeries, type) - - _IS_DASK_INSTALLED: bool = True - - -def _check_is_dask_installed(): - if not _IS_DASK_INSTALLED: - raise ImportError('Dependency "dask" is not installed.') - - -_IS_TORCH_INSTALLED: bool = False - -with optional_dependency("torch"): - import torch - - assert isinstance(torch.Tensor, type) - - _IS_TORCH_INSTALLED: bool = True - - -def _check_is_torch_installed(): - if not _IS_TORCH_INSTALLED: - raise ImportError('Dependency "torch" is not installed.') diff --git a/src/fmcore/util/language/_iter.py b/src/fmcore/util/language/_iter.py deleted file mode 100644 index 5afb988..0000000 --- a/src/fmcore/util/language/_iter.py +++ /dev/null @@ -1,101 +0,0 @@ -from typing import * - -import numpy as np -import pandas as pd - -from ._alias import Alias -from ._math import is_int_in_floats_clothing -from ._pbar import ProgressBar -from ._structs import is_dict_like, is_set_like - - -def irange(low: Union[float, int], high: Union[float, int], step: Union[float, int] = 1): - """Inclusive range, useful for coding up math notation.""" - if not (isinstance(low, int) or (isinstance(low, float) and low.is_integer())): - raise ValueError(f"low={low} is not a valid integer.") - if not (isinstance(high, int) or (isinstance(high, float) and high.is_integer())): - raise ValueError(f"high={high} is not a valid integer.") - if not (isinstance(step, int) or (isinstance(step, float) and step.is_integer())): - raise ValueError(f"step={step} is not a valid integer.") - return range(int(low), int(high) + 1, int(step)) - - -def frange(low: float, high: float, step: float, *, limits: bool = True) -> List[float]: - """Inclusive range, useful for coding up math notation.""" - assert isinstance(low, (int, float)) and isinstance(high, (int, float)) and isinstance(step, (int, float)) - out: List[float] = [ - x - for x in [round(float(x) / step, 0) * step for x in 
np.arange(low, high + step, step)] - if low <= x <= high - ] - if limits: - out: List[float] = sorted(set(out).union({low, high})) - return out - - -def is_valid_idx( - l: Union[List, Tuple, np.ndarray, pd.Series, pd.DataFrame], - idx: int, - *, - raise_error: bool = True, -) -> bool: - assert isinstance(l, (list, tuple, np.ndarray, pd.Series, pd.DataFrame)) - assert idx >= 0, "Can only check validity of non-negative indexes" - if len(l) == 0: - if raise_error: - raise ValueError(f"Cannot check validity of index for empty {str(type(l))}") - return False ## No index is valid - return idx in range(0, len(l)) - - -def iter_batches( - struct: Union[List, Tuple, Set, Dict, np.ndarray, pd.Series, pd.DataFrame, int], batch_size: int, **kwargs -) -> Generator[List[Any], None, None]: - assert isinstance(batch_size, int) and batch_size > 0 - progress_bar: Optional[Dict] = Alias.get_progress_bar(kwargs) - if is_int_in_floats_clothing(struct): - struct: List[int] = list(range(int(struct))) - if is_set_like(struct): - struct_type: Type = set - elif is_dict_like(struct): - struct_type: Type = dict - else: - struct_type: Optional[Type] = None - - struct_len: int = len(struct) - pbar: ProgressBar = ProgressBar.of( - progress_bar, - total=struct_len, - initial=0, - desc="Iterating", - prefer_kwargs=False, - unit="item", - ) - try: - if struct_type is not None: - buf: List[Any] = [] - if isinstance(struct, dict): - struct: ItemsView = struct.items() - for x in struct: - buf.append(x) - if len(buf) == batch_size: - out = struct_type(buf) - yield out - pbar.update(len(out)) - buf: List[Any] = [] - if len(buf) > 0: - out = struct_type(buf) - yield out - pbar.update(len(out)) - else: - for i in range(0, struct_len, batch_size): - if isinstance(struct, (pd.Series, pd.DataFrame)): - out = struct.iloc[i : min(i + batch_size, struct_len)] - else: - out = struct[i : min(i + batch_size, struct_len)] - yield out - pbar.update(len(out)) - pbar.success() - except Exception as e: - pbar.failed() - raise e diff --git a/src/fmcore/util/language/_math.py b/src/fmcore/util/language/_math.py deleted file mode 100644 index 52ee683..0000000 --- a/src/fmcore/util/language/_math.py +++ /dev/null @@ -1,117 +0,0 @@ -from typing import * - -import numpy as np -import pandas as pd -from pydantic.typing import Literal - -from ._structs import is_numpy_float_array - -is_even = lambda x: x % 2 == 0 -is_odd = lambda x: x % 2 == 1 - -is_int_in_floats_clothing = lambda x: isinstance(x, int) or (isinstance(x, float) and int(x) == x) - - -def mean(vals): - return sum(vals) / len(vals) - - -def clip(low: Union[int, float], val: Union[int, float], high: Union[int, float]): - assert isinstance(low, (int, float, np.integer, np.float_)) - assert isinstance(high, (int, float, np.integer, np.float_)) - assert isinstance(val, (int, float, np.integer, np.float_)) - assert low <= high - return max(low, min(val, high)) - - -def pad_interval(low: Union[int, float], high: Union[int, float], pad: float) -> Tuple[float, float]: - assert isinstance(low, (int, float, np.integer, np.float_)) - assert isinstance(high, (int, float, np.integer, np.float_)) - assert isinstance(pad, (int, float, np.integer, np.float_)) and 0.0 <= pad <= 1.0 - assert low <= high - width: float = float(high) - float(low) - pad: float = float(pad) - return (low - width * pad, high + width * pad) - - -def rolling_avg(iterator: Union[Iterable, Iterator, Generator]) -> float: - if not hasattr(iterator, "__iter__"): - raise ValueError( - f"Cannot calculate rolling average from an object 
which is neither an iterator or generator; " - f"found object of type {type(iterator)}." - ) - avg: float = 0 - for i, x in enumerate(iterator): - avg = update_rolling_avg(avg, i, x) - return avg - - -def update_rolling_avg(avg_i: float, i: int, val_i: float) -> float: - """ - Calculates a rolling average. - :param avg_i: the current average. - :param i: the i'th index (starting from 0) - :param val_i: the i'th value. - :return: the updated average. - - Example usage: - n: int = 1_000_000 - l: List[int] = list(range(1, n+1)) ## We know this adds up to n*(n+1)/2, thus the average is (n+1)/2 - avg: float = 0 - for i, x in enumerate(l): - avg = update_rolling_avg(avg, i, x) - assert avg == sum(l)/n == (n+1)/2 - """ - n: int = i + 1 - return ((n - 1) * avg_i + val_i) / n - - -def entropy(probabilities: np.ndarray) -> float: - # Remove zero probabilities to avoid issues with logarithm - if not isinstance(probabilities, np.ndarray): - probabilities: np.ndarray = np.array(probabilities) - assert is_numpy_float_array(probabilities) - prob_sum: float = float(probabilities.sum()) - if abs(1 - prob_sum) > 1e-2: - raise ValueError(f"Probabilities sum to {prob_sum}, should sum to 1") - probabilities = probabilities[probabilities > 0] - # probabilities += 1e-9 - _entropy = float(-np.sum(probabilities * np.log2(probabilities))) - return _entropy - - -def relative_increase( - prev: float, - cur: float, - *, - how: Literal["ratio", "pct"] = "ratio", - decimals: Optional[int] = None, -) -> float: - assert how in {"ratio", "pct"} - increase_frac: float = cur / prev - if how == "ratio": - if decimals is None: - decimals: int = 5 - return round(increase_frac - 1, decimals) - elif how == "pct": - if decimals is None: - decimals: int = 2 - return round(100 * (increase_frac - 1), decimals) - elif how == "bps": - if decimals is None: - decimals: int = 1 - return round(100 * 100 * (increase_frac - 1), decimals) - else: - raise NotImplementedError(f'Unsupported `method`: "{how}"') - - -def to_pct(counts: pd.Series): ## Converts value counts to percentages - _sum = counts.sum() - return pd.DataFrame( - { - "value": counts.index.tolist(), - "count": counts.tolist(), - "pct": counts.apply(lambda x: 100 * x / _sum).tolist(), - "count_str": counts.apply(lambda x: f"{x} of {_sum}").tolist(), - } - ) diff --git a/src/fmcore/util/language/_pbar.py b/src/fmcore/util/language/_pbar.py deleted file mode 100644 index 6922ee6..0000000 --- a/src/fmcore/util/language/_pbar.py +++ /dev/null @@ -1,255 +0,0 @@ -from typing import * - -from pydantic import Extra, conint, root_validator -from pydantic.typing import Literal -from tqdm.auto import tqdm as AutoTqdmProgressBar -from tqdm.autonotebook import tqdm as NotebookTqdmProgressBar -from tqdm.std import tqdm as StdTqdmProgressBar - -from ._alias import set_param_from_alias -from ._function import get_fn_spec -from ._import import _IS_RAY_INSTALLED, optional_dependency -from ._string import String -from ._structs import filter_keys, is_dict_like, is_list_or_set_like, remove_keys -from ._typing import MutableParameters, Parameters - -TqdmProgressBar = Union[AutoTqdmProgressBar, NotebookTqdmProgressBar, StdTqdmProgressBar] - -ProgressBar = "ProgressBar" - - -class ProgressBar(MutableParameters): - pbar: Optional[TqdmProgressBar] = None - style: Literal["auto", "notebook", "std", "ray"] = "auto" - unit: str = "row" - color: str = "#0288d1" ## Bluish - ncols: int = 100 - smoothing: float = 0.15 - total: Optional[int] = None - disable: bool = False - miniters: conint(ge=1) = 1 - 
_pending_updates: int = 0 - - class Config(Parameters.Config): - extra = Extra.allow - - @root_validator(pre=False) - def _set_params(cls, params: Dict) -> Dict: - set_param_from_alias(params, param="disable", alias=["disabled"]) - pbar: TqdmProgressBar = cls._create_pbar(**remove_keys(params, ["pbar", "color"])) - pbar.color = params["color"] - pbar.refresh() - params["pbar"]: TqdmProgressBar = pbar - return params - - @classmethod - def _create_pbar( - cls, - style: Literal["auto", "notebook", "std", "ray"], - **kwargs, - ) -> TqdmProgressBar: - if style == "auto": - with optional_dependency("ipywidgets"): - kwargs["ncols"]: Optional[int] = None - return AutoTqdmProgressBar(**kwargs) - elif style == "notebook": - with optional_dependency("ipywidgets"): - kwargs["ncols"]: Optional[int] = None - return NotebookTqdmProgressBar(**kwargs) - elif _IS_RAY_INSTALLED and style == "ray": - from ray.experimental import tqdm_ray - - kwargs = filter_keys( - kwargs, - keys=set(get_fn_spec(tqdm_ray.tqdm).args + get_fn_spec(tqdm_ray.tqdm).kwargs), - how="include", - ) - return tqdm_ray.tqdm(**kwargs) - else: - return StdTqdmProgressBar(**kwargs) - - @classmethod - def iter(cls, iterable: Union[Generator, Iterator, List, Tuple, Set, Dict, ItemsView], **kwargs): - if is_list_or_set_like(iterable) or is_dict_like(iterable): - kwargs["total"] = len(iterable) - if is_dict_like(iterable): - iterable: ItemsView = iterable.items() - pbar: ProgressBar = ProgressBar.of(**kwargs) - try: - for item in iterable: - yield item - pbar.update(1) - pbar.success() - except Exception as e: - pbar.failed() - raise e - - @classmethod - def of( - cls, - progress_bar: Optional[Union[ProgressBar, Dict, bool]] = True, - *, - prefer_kwargs: bool = True, - **kwargs, - ) -> ProgressBar: - if isinstance(progress_bar, ProgressBar): - if prefer_kwargs: - if "total" in kwargs: - progress_bar.set_total(kwargs["total"]) - if "initial" in kwargs: - progress_bar.set_n(kwargs["initial"]) - if "desc" in kwargs: - progress_bar.set_description(kwargs["desc"]) - if "unit" in kwargs: - progress_bar.set_description(kwargs["unit"]) - return progress_bar - if progress_bar is not None and not isinstance(progress_bar, (bool, dict)): - raise ValueError( - "You must pass `progress_bar` as either a bool, dict or None. None or False disables it." - ) - if progress_bar is True: - progress_bar: Optional[Dict] = dict() - elif progress_bar is False: - progress_bar: Optional[Dict] = None - if progress_bar is not None and not isinstance(progress_bar, dict): - raise ValueError( - "You must pass `progress_bar` as either a bool, dict or None. None or False disables it." 
- ) - if progress_bar is None: - progress_bar: Dict = dict(disable=True) - elif isinstance(progress_bar, dict) and len(kwargs) > 0: - if prefer_kwargs is True: - progress_bar: Dict = { - **progress_bar, - **kwargs, - } - else: - progress_bar: Dict = { - **kwargs, - **progress_bar, - } - assert isinstance(progress_bar, dict) - return ProgressBar(**progress_bar) - - def update(self, n: int = 1) -> Optional[bool]: - self._pending_updates += n - if abs(self._pending_updates) >= self.miniters: - out = self.pbar.update(n=self._pending_updates) - self.refresh() - self._pending_updates = 0 - return out - else: - return None - - def set_n(self, new_n: int): - self.pbar.update(n=new_n - self.pbar.n) - self._pending_updates = 0 ## Clear all updates after setting new value - self.refresh() - - def set_total(self, new_total: int): - self.pbar.total = new_total - self._pending_updates = 0 ## Clear all updates after setting new value - self.refresh() - - def set_description(self, desc: Optional[str] = None, refresh: Optional[bool] = True): - out = self.pbar.set_description(desc=desc, refresh=refresh) - self.refresh() - return out - - def set_unit(self, new_unit: str): - self.pbar.unit = new_unit - self.refresh() - - def success(self, desc: Optional[str] = None, close: bool = True, append_desc: bool = True): - self._complete_with_status( - color="#43a047", ## Dark Green - desc=desc, - close=close, - append_desc=append_desc, - ) - - def stopped(self, desc: Optional[str] = None, close: bool = True, append_desc: bool = True): - self._complete_with_status( - color="#b0bec5", ## Dark Grey - desc=desc, - close=close, - append_desc=append_desc, - ) - - def failed(self, desc: Optional[str] = None, close: bool = True, append_desc: bool = True): - self._complete_with_status( - color="#e64a19", ## Dark Red - desc=desc, - close=close, - append_desc=append_desc, - ) - - def _complete_with_status( - self, - color: str, - desc: Optional[str], - close: bool, - append_desc: bool, - ): - if not self.pbar.disable: - self.pbar.update(n=self._pending_updates) - self._pending_updates = 0 - self.color = color - self.pbar.colour = color - if desc is not None: - if append_desc: - desc: str = f"[{desc}] {self.pbar.desc}" - self.pbar.desc = desc - self.pbar.refresh() - if close: - self.close() - - def refresh(self): - self.pbar.colour = self.color - self.pbar.refresh() - - def close(self): - self.pbar.refresh() - self.pbar.close() - self.pbar.refresh() - - def __del__(self): - self.pbar.close() - - -def create_progress_bar( - *, - style: Optional[Literal["auto", "notebook", "std"]] = "auto", - unit: str = "row", - ncols: int = 100, - smoothing: float = 0.1, - **kwargs, -) -> TqdmProgressBar: - try: - if style == "auto": - with optional_dependency("ipywidgets"): - ncols: Optional[int] = None - return AutoTqdmProgressBar(ncols=ncols, unit=unit, smoothing=smoothing, **kwargs) - elif style == "notebook": - with optional_dependency("ipywidgets"): - ncols: Optional[int] = None - return NotebookTqdmProgressBar(ncols=ncols, unit=unit, smoothing=smoothing, **kwargs) - elif _IS_RAY_INSTALLED and style == "ray": - from ray.experimental import tqdm_ray - - kwargs = filter_keys( - kwargs, - keys=set(get_fn_spec(tqdm_ray.tqdm).args + get_fn_spec(tqdm_ray.tqdm).kwargs), - how="include", - ) - return tqdm_ray.tqdm(**kwargs) - else: - return StdTqdmProgressBar(ncols=ncols, unit=unit, smoothing=smoothing, **kwargs) - except Exception as e: - kwargs["style"] = style - kwargs["unit"] = unit - kwargs["ncols"] = ncols - kwargs["smoothing"] = smoothing - 
raise ValueError( - f"Error: could not create progress bar using settings: {kwargs}. Stack trace:\n{String.format_exception_msg(e)}" - ) diff --git a/src/fmcore/util/language/_selection.py b/src/fmcore/util/language/_selection.py deleted file mode 100644 index fa604d4..0000000 --- a/src/fmcore/util/language/_selection.py +++ /dev/null @@ -1,409 +0,0 @@ -import math -import random -from typing import * - -import numpy as np -import pandas as pd -from pydantic import confloat, conint -from pydantic.typing import Literal - -from ._import import optional_dependency -from ._structs import as_list, as_set, flatten1d, is_dict_like, is_list_like, is_set_like, is_sorted -from ._typing import type_str -from ._utils import get_default, is_null, is_scalar - -FractionalBool = Union[confloat(ge=0.0, le=1.0), bool] -SampleSizeType = Union[confloat(gt=0.0, le=1.0), conint(gt=1)] - - -def resolve_fractional_bool(fractional_bool: Optional[FractionalBool], seed: int = None) -> bool: - if fractional_bool in {0.0, False, None}: - return False - elif fractional_bool in {1.0, False, True}: - return True - else: - rnd: float = np.random.RandomState(seed=seed).random() - return rnd <= fractional_bool - - -def resolve_sample_size(sample_size: Optional[SampleSizeType], length: int) -> conint(ge=0): - if sample_size in {1.0, True}: - n = length - elif 0.0 < sample_size < 1.0: - n: int = math.ceil(sample_size * length) ## Use at least 1 row. - elif isinstance(sample_size, int) and 1 < sample_size: - n: int = sample_size - else: - raise ValueError(f"Invalid value for `sample_size`: {sample_size}") - n: int = min(n, length) - return n - - -def infer_np_dtype( - data: Union[List, np.ndarray, pd.Series, "torch.Tensor"], - sample_size: SampleSizeType = True, - str_to_object: bool = True, - return_str_for_collection: bool = False, -) -> Optional[Union[np.dtype, Type, str]]: - """ - Fast inference of the numpy dtype in a list. - Note: we cannot use pandas.api.types.infer_dtype because it returns Pandas dtypes, not numpy. - - :param data: data collection (usually a list or tuple). - :param sample_size: amount of data to subsample (without replacement) in order to determine the dtype. - If False, it will not subsample data. If True, it will use entire data. - If 0.0 < sample < 1.0, then we will subsample a fraction of the data. - If 1 <= sample, we will subsample these many rows of data. - :param str_to_object: whether to treat string as objects rather than np.unicode_ (like "U<1"). - :param return_str_for_collection: whether to return the string 'collection' for collections like list, set, - numpy array, etc. - :return: - """ - if isinstance(data, (np.ndarray, pd.Series)): - return data.dtype - with optional_dependency("torch"): - import torch - - from ._structs import TORCH_TO_NUMPY_DTYPE_MAP - - if isinstance(data, torch.Tensor): - return TORCH_TO_NUMPY_DTYPE_MAP[data.dtype] - - data: List = as_list(data) - dtypes: Set[Union[Type, np.dtype]] = set() - has_nulls: bool = False - for x in random_sample(data, n=sample_size, replacement=False): - if str_to_object and np.issubdtype(type(x), np.character): - ## Fast convert str, np.str_ and np.unicode_ to object: - return object - if not is_scalar(x): - ## Fast return for collections such as list, tuple, dict, set, np.ndarray, Tensors. 
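A small illustration of the collection fall-through described in the comment above (a hedged sketch assuming `infer_np_dtype` as defined in this file; the input values are made up):

    infer_np_dtype([[1, 2], [3]])                                  ## non-scalar elements -> object
    infer_np_dtype([[1, 2], [3]], return_str_for_collection=True)  ## -> the string "collection"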
- if return_str_for_collection: - return "collection" - return object - if is_null(x): ## Checks NaNs, None, and pd.NaT - has_nulls: bool = True - else: - dtypes.add(type(x)) - if len(dtypes) == 0: - ## All NaNs / None - return None - elif len(dtypes) == 1: - dtype = next(iter(dtypes)) - ## Ref: https://numpy.org/doc/stable/reference/arrays.dtypes.html#Built-in%20Python%20types - if dtype in {bool, np.bool_, float, np.float_, complex, np.complex_, bytes}: - return np.dtype(dtype) - return _np_dtype_fallback(dtypes, has_nulls=has_nulls, str_to_object=str_to_object) - - -def _np_dtype_fallback(dtypes: Union[Type, Set[Type]], has_nulls: bool, str_to_object: bool): - ## We have one or more dtypes, which might be Python types or Numpy dtypes. - ## We will now check if all the dtypes have a common parent, based on the NumPy scalar types hierarchy: - ## i.e. https://numpy.org/doc/stable/reference/arrays.scalars.html - if all_are_np_subtypes( - dtypes, - { - np.bool_, - }, - ): - if has_nulls: - return np.float_ ## Converts None to NaN, and True/False to 1.0/0.0 - return np.bool_ - elif all_are_np_subtypes(dtypes, {np.bool_, np.integer}): - if has_nulls: - return np.float_ ## Converts None to NaN, True/False to 1.0/0.0, and 123 to 123.0 - return np.int_ - elif all_are_np_subtypes(dtypes, {np.bool_, np.integer, np.floating}): - return np.float_ - elif all_are_np_subtypes( - dtypes, - { - np.character, - }, - ): - if str_to_object: - return object - return np.unicode_ - elif all_are_np_subtypes(dtypes, {np.bool_, np.integer, np.floating, np.complex_}): - return np.complex_ - ## Multiple, heterogeneous and incompatible types, return as object - return object - - -def all_are_np_subtypes( - dtypes: Union[Type, Set[Type]], - parent_dtypes: Union[Type, Set[Type]], -) -> bool: - ## Note: the following hold for Python types when checking with np.issubdtype: - ## np.issubdtype(bool, np.bool_) is True - ## np.issubdtype(int, np.integer) is True (however, np.issubdtype(bool, np.integer) is False) - ## np.issubdtype(float, np.floating) is True (however, np.issubdtype(int, np.floating) is False) - ## np.issubdtype(complex, np.complex_) is True (however, np.issubdtype(float, np.complex_) is False) - ## np.issubdtype(str, np.character) is True - dtypes: Set[Type] = as_set(dtypes) - parent_dtypes: Set[Type] = as_set(parent_dtypes) - return all( - {any({np.issubdtype(dtype, parent_dtype) for parent_dtype in parent_dtypes}) for dtype in dtypes} - ) - - -def random_sample( - data: Union[List, Tuple, Set, np.ndarray], - n: SampleSizeType, - *, - replacement: bool = False, - seed: Optional[int] = None, -) -> Union[List, np.ndarray]: - """ - Sample data randomly from a list or numpy array, with or without replacement. - :param data: list or numpy array to randomly subsample. - :param n: size of the sample to return. - :param replacement: whether to sample with replacement or not. - :param seed: optional random seed to use for reproducibility. - :return: list or numpy array of randomly-sampled data. 
- """ - np_random = np.random.RandomState(seed) - py_random = random.Random(seed) - if is_set_like(data): - data: List = list(data) - if not is_list_like(data): - raise ValueError( - f"Input `data` must be {list}, {tuple} or {np.ndarray}; found object of type {type(data)}" - ) - if len(data) == 1: - return data - l: Union[List, np.ndarray] = data - length: int = len(l) - n: int = resolve_sample_size(sample_size=n, length=length) - if replacement: - ## Subsample with replacement: - ## Ref: https://stackoverflow.com/a/71892814/4900327 - if isinstance(l, (list, tuple)): - if n < 50: - return py_random.choices(l, k=n) - else: - return [l[idx] for idx in np_random.randint(0, len(l), n)] - elif isinstance(l, np.ndarray): - if n < 25: - return [l[idx] for idx in (py_random.randrange(0, len(l)) for _ in range(n))] - else: - return np_random.choice(l, n, replace=True) - else: - ## Subsample without replacement: - ## Ref: https://stackoverflow.com/a/71892814/4900327 - if isinstance(l, (list, tuple)): - return py_random.sample(l, n) - elif isinstance(l, np.ndarray): - return np_random.choice(l, n, replace=False) - raise NotImplementedError(f"Unsupported input data type: {type(data)}") - - -def values_dist(vals: Union[List, Tuple, np.ndarray, pd.Series]) -> pd.Series: - assert isinstance(vals, (list, tuple, np.ndarray, pd.Series)) - val_counts: pd.Series = pd.Series(Counter(vals)) ## Includes nan and None as keys. - return val_counts / val_counts.sum() - - -def sample_idxs_match_distribution( - source: Union[List, Tuple, np.ndarray, pd.Series], - target: Union[List, Tuple, np.ndarray, pd.Series], - n: Optional[int] = None, - seed: Optional[int] = None, - shuffle: bool = True, - target_is_dist: bool = False, -) -> np.ndarray: - """ - Values from current series based on another distribution, and return randomly-shuffled indexes from the source. - Selecting these indexes will give a distribution from the source whicha matches that of the target distribution. - """ - if not target_is_dist: - target_prob_dist: pd.Series = values_dist(target) - else: - target_prob_dist: pd.Series = target - assert isinstance(target_prob_dist, pd.Series) - assert ( - abs(float(target_prob_dist.sum()) - 1.0) <= 1e-2 - ) ## Sum of probs should be exactly or very close to 1. 
- - assert isinstance(source, (list, tuple, np.ndarray, pd.Series)) - source_vc: pd.Series = pd.Series(Counter(source)) - # print(f'\nsource_vc:\n{source_vc}') - # print(f'\ntarget_prob_dist:\n{target_prob_dist}') - missing_source_vals: Set = set(target_prob_dist.index) - set(source_vc.index) - if len(missing_source_vals) > 0: - raise ValueError( - f"Cannot sample; the following values are missing in the source: {missing_source_vals}" - ) - - n: int = get_default(n, len(source)) - max_n_sample: pd.Series = (source_vc / target_prob_dist).apply( - lambda max_n_sample_category: min(max_n_sample_category, n), - ) - # print(f'\n\nmax_n_sample:\n{max_n_sample}') - max_n_sample: int = math.floor(min(max_n_sample.dropna())) - # print(f'Max possible sample size: {max_n_sample}') - source_value_wise_count_to_sample: pd.Series = (target_prob_dist * max_n_sample).round(0).astype(int) - source_value_wise_count_to_sample: Dict[Any, int] = source_value_wise_count_to_sample.to_dict() - ## Select random indexes: - source_val_idxs: Dict[Any, List[int]] = {val: [] for val in source_vc.index} - for idx, val in enumerate(source): - if val in source_value_wise_count_to_sample: - source_val_idxs[val].append(idx) - sampled_idxs: np.array = np.array( - flatten1d( - [ - random_sample(source_val_idxs[val], n=req_source_val_count, seed=seed) - for val, req_source_val_count in source_value_wise_count_to_sample.items() - ] - ) - ) - if shuffle: - sampled_idxs: np.ndarray = np.random.RandomState(seed).permutation(sampled_idxs) - return sampled_idxs - - -def random_cartesian_product(*lists, seed: Optional[int] = None, n: int): - rnd = random.Random(seed) - cartesian_idxs: Set[Tuple[int, ...]] = set() - list_lens: List[int] = [len(l) for l in lists] - max_count: int = 1 - for l_len in list_lens: - max_count *= l_len - if max_count < n: - raise ValueError(f"At most {max_count} cartesian product elements can be created.") - while len(cartesian_idxs) < n: - rnd_idx: Tuple[int, ...] = tuple(rnd.randint(0, l_len - 1) for l_len in list_lens) - if rnd_idx not in cartesian_idxs: - cartesian_idxs.add(rnd_idx) - elem = [] - for l_idx, l in zip(rnd_idx, lists): - elem.append(l[l_idx]) - yield elem - - -def argmax(d: Union[List, Tuple, np.ndarray, Dict, Set]) -> Any: - if is_set_like(d): - raise ValueError(f"Cannot get argmax from a {type_str(d)}.") - if is_dict_like(d): - ## Get key pertaining to max value: - return max(d, key=d.get) - assert is_list_like(d) - return max([(i, x) for (i, x) in enumerate(d)], key=lambda x: x[1])[0] - - -def argmin(d: Union[List, Tuple, np.ndarray, Dict, Set]) -> Any: - if is_set_like(d): - raise ValueError(f"Cannot get argmin from a {type_str(d)}.") - if is_dict_like(d): - ## Get key pertaining to max value: - return min(d, key=d.get) - assert is_list_like(d) - return min([(i, x) for (i, x) in enumerate(d)], key=lambda x: x[1])[0] - - -def best_k( - vals: np.ndarray, - k: int, - *, - how: Literal["min", "max"], - sort: Optional[Literal["ascending", "descending"]] = None, - indexes_only: bool = False, -) -> Union[np.ndarray, Tuple[np.ndarray, np.ndarray]]: - """Efficiently gets the top-k elements from a numpy array.""" - assert isinstance(k, int) and k > 0 - ## np.argpartition creates a new array with the top-k/bottom-k scores in the head/tail k elements, - ## but these k are not actually sorted. 
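A minimal, self-contained sketch of the argpartition pattern described in the comment above, using made-up values:

    import numpy as np

    vals = np.array([0.9, 0.1, 0.5, 0.3, 0.7])
    k = 2
    ## np.argpartition puts the indexes of the k smallest values in the first k slots, in arbitrary order:
    bottom_k_idxs = np.argpartition(vals, k)[:k]
    ## Argsorting just those k candidate values gives the final ascending order, i.e. indexes [1, 3]:
    bottom_k_idxs_sorted = bottom_k_idxs[vals[bottom_k_idxs].argsort()]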
- if how == "min": - sort: str = sort if sort is not None else "ascending" - bottom_k_idxs: np.ndarray = np.argpartition(vals, k, axis=0)[:k] - ## Index vals to get bottom-k values, unsorted: - bottom_k_vals: np.ndarray = vals[bottom_k_idxs] - ## Get argsorted indexes for the bottom-k values (between 1 & k). - ## We then use this to index the bottom-k-indexes array: - if sort == "ascending": - bottom_k_idxs_sorted: np.ndarray = bottom_k_idxs[bottom_k_vals.argsort(axis=0)] - bottom_k_vals_sorted = np.sort(bottom_k_vals, axis=0) - elif sort == "descending": - bottom_k_idxs_sorted: np.ndarray = bottom_k_idxs[bottom_k_vals.argsort(axis=0)[::-1]] - bottom_k_vals_sorted = np.sort(bottom_k_vals, axis=0)[::-1] - else: - raise NotImplementedError(f"Unsupported value of `sort`: {sort}") - # print(f'bottom_k_vals_sorted: {bottom_k_vals_sorted}') - # print(f'bottom_k_idxs_sorted: {bottom_k_idxs_sorted}') - # assert bool((vals[bottom_k_idxs_sorted] == bottom_k_vals_sorted).all()) - if indexes_only: - return bottom_k_idxs_sorted - return bottom_k_idxs_sorted, bottom_k_vals_sorted - elif how == "max": - sort: str = sort if sort is not None else "descending" - top_k_idxs: np.ndarray = np.argpartition(vals, -k, axis=0)[-k:] - ## Index vals to get top-k values, unsorted: - top_k_vals: np.ndarray = vals[top_k_idxs] - ## Get argsorted indexes for the top-k values (between 1 & k). - ## We then use this to index the top-k-indexes array: - if sort == "ascending": - top_k_idxs_sorted: np.ndarray = top_k_idxs[top_k_vals.argsort(axis=0)] - top_k_vals_sorted = np.sort(top_k_vals, axis=0) - elif sort == "descending": - top_k_idxs_sorted: np.ndarray = top_k_idxs[top_k_vals.argsort(axis=0)[::-1]] - top_k_vals_sorted = np.sort(top_k_vals, axis=0)[::-1] - else: - raise NotImplementedError(f"Unsupported value of `sort`: {sort}") - # print(f'top_k_vals_sorted: {top_k_vals_sorted}') - # print(f'top_k_idxs_sorted: {top_k_idxs_sorted}') - # assert bool((vals[top_k_idxs_sorted] == top_k_vals_sorted).all()) - if indexes_only: - return top_k_idxs_sorted - return top_k_idxs_sorted, top_k_vals_sorted - else: - raise ValueError(f"Unsupported value for `how`: {how}") - - -def shuffle_items( - struct: Union[List, Tuple, Set, Dict, str], - *, - seed: Optional[int] = None, - dict_return_values: bool = False, -) -> Generator[Any, None, None]: - if is_set_like(struct): - struct: Tuple = tuple(struct) - elif is_dict_like(struct): - if dict_return_values: - struct: Tuple = tuple(struct.values()) - else: - struct: Tuple = tuple(struct.items()) - rnd_idxs: List[int] = list(range(len(struct))) - random.Random(seed).shuffle(rnd_idxs) - for rnd_idx in rnd_idxs: - yield struct[rnd_idx] - - -_Comparable = Union[int, float, str] - - -def binary_search( - l: Union[List[_Comparable], Tuple[_Comparable, ...]], - target: _Comparable, - *, - return_tuple: bool = False, -) -> Union[Tuple[Optional[_Comparable], Optional[_Comparable]], _Comparable]: - if not is_sorted(l): - l: List[_Comparable] = sorted(l) - low: int = 0 - high: int = len(l) - 1 - while low <= high: - mid = (low + high) // 2 - if l[mid] == target: - if return_tuple: - return l[mid], l[mid] - return l[mid] - elif l[mid] < target: - low: int = mid + 1 - else: - high: int = mid - 1 - - ## When the target is not found, set lower and upper bounds - lower: _Comparable = l[high] if high >= 0 else None - upper: _Comparable = l[low] if low < len(l) else None - - return lower, upper diff --git a/src/fmcore/util/language/_string.py b/src/fmcore/util/language/_string.py deleted file mode 100644 index 
e51abf6..0000000 --- a/src/fmcore/util/language/_string.py +++ /dev/null @@ -1,10099 +0,0 @@ -import functools -import inspect -import io -import json -import math -import pprint -import random -import re -import string -import types -from ast import literal_eval -from collections import defaultdict -from datetime import datetime, timedelta -from hashlib import sha256 -from typing import * - -import numpy as np -import pandas as pd -from pydantic import confloat, conint, validate_arguments - -from ._function import is_function -from ._import import optional_dependency - -StructuredBlob = Union[List, Dict, List[Dict]] ## used for type hints. -KERNEL_START_DT: datetime = datetime.now() - -_PUNCTUATION_REMOVAL_TABLE = str.maketrans( - "", - "", - string.punctuation, ## Will be removed -) -_PUNCTUATION_REMOVAL_TABLE_WITH_LOWERCASE = str.maketrans( - "ABCDEFGHIJKLMNOPQRSTUVWXYZ", - "abcdefghijklmnopqrstuvwxyz", - string.punctuation, ## Will be removed -) -_PUNCTUATION_REMOVAL_TABLE_WITH_SPACE = str.maketrans( - "", - "", - " " + string.punctuation, ## Will be removed -) -_PUNCTUATION_REMOVAL_TABLE_WITH_LOWERCASE_AND_SPACE = str.maketrans( - "ABCDEFGHIJKLMNOPQRSTUVWXYZ", - "abcdefghijklmnopqrstuvwxyz", - " " + string.punctuation, ## Will be removed -) - -_PUNCTUATION_REMOVAL_TABLE_WITH_NUMBERS = str.maketrans( - "", - "", - "1234567890" + string.punctuation, ## Will be removed -) -_PUNCTUATION_REMOVAL_TABLE_WITH_LOWERCASE_AND_NUMBERS = str.maketrans( - "ABCDEFGHIJKLMNOPQRSTUVWXYZ", - "abcdefghijklmnopqrstuvwxyz", - "1234567890" + string.punctuation, ## Will be removed -) -_PUNCTUATION_REMOVAL_TABLE_WITH_SPACE_AND_NUMBERS = str.maketrans( - "", - "", - "1234567890 " + string.punctuation, ## Will be removed -) -_PUNCTUATION_REMOVAL_TABLE_WITH_LOWERCASE_AND_SPACE_AND_NUMBERS = str.maketrans( - "ABCDEFGHIJKLMNOPQRSTUVWXYZ", - "abcdefghijklmnopqrstuvwxyz", - "1234567890 " + string.punctuation, ## Will be removed -) - - -class NeverFailJsonEncoder(json.JSONEncoder): - def default(self, obj): - # print(f'Running NeverFailJsonEncoder') - if isinstance(obj, (np.integer, int)): - return int(obj) - elif isinstance(obj, (np.bool_, bool)): - return bool(obj) - elif isinstance(obj, (np.floating, float)): - return float(obj) - elif isinstance(obj, (np.ndarray, pd.Series, list, set, tuple)): - return obj.tolist() - elif isinstance(obj, complex): - return obj.real, obj.imag - elif isinstance( - obj, - ( - types.FunctionType, - types.MethodType, - types.BuiltinFunctionType, - types.BuiltinMethodType, - types.LambdaType, - functools.partial, - ), - ): - return {"": f"{obj.__module__}.{obj.__qualname__}{inspect.signature(obj)}"} - with optional_dependency("torch"): - import torch - - if isinstance(obj, torch.dtype): - return str(obj) - try: - return super(NeverFailJsonEncoder, self).default(obj) - except TypeError: - obj_members: List[str] = [] - for k, v in obj.__dict__.items(): - if is_function(v): - continue - k_str: str = str(k) - v_str: str = "..." 
- obj_members.append(f"{k_str}={v_str}") - obj_members_str: str = ", ".join(obj_members) - return f"{obj.__class__.__name__}({obj_members_str})" - - -## Taken from: https://github.com/django/django/blob/master/django/utils/baseconv.py#L101 -class BaseConverter: - decimal_digits: str = "0123456789" - - def __init__(self, digits, sign="-"): - self.sign = sign - self.digits = digits - if sign in self.digits: - raise ValueError("Sign character found in converter base digits.") - - def __repr__(self): - return "<%s: base%s (%s)>" % (self.__class__.__name__, len(self.digits), self.digits) - - def encode(self, i): - neg, value = self.convert(i, self.decimal_digits, self.digits, "-") - if neg: - return self.sign + value - return value - - def decode(self, s): - neg, value = self.convert(s, self.digits, self.decimal_digits, self.sign) - if neg: - value = "-" + value - return int(value) - - def convert(self, number, from_digits, to_digits, sign): - if str(number)[0] == sign: - number = str(number)[1:] - neg = 1 - else: - neg = 0 - - # make an integer out of the number - x = 0 - for digit in str(number): - x = x * len(from_digits) + from_digits.index(digit) - - # create the result in base 'len(to_digits)' - if x == 0: - res = to_digits[0] - else: - res = "" - while x > 0: - digit = x % len(to_digits) - res = to_digits[digit] + res - x = int(x // len(to_digits)) - return neg, res - - -class String: - def __init__(self): - raise TypeError(f'Cannot instantiate utility class "{str(self.__class__)}"') - - EMPTY: str = "" - SPACE: str = " " - DOUBLE_SPACE: str = SPACE * 2 - FOUR_SPACE: str = SPACE * 4 - TAB: str = "\t" - NEWLINE: str = "\n" - WINDOWS_NEWLINE: str = "\r" - BACKSLASH: str = "\\" - SLASH: str = "/" - PIPE: str = "|" - SINGLE_QUOTE: str = "'" - DOUBLE_QUOTE: str = '"' - COMMA: str = "," - COMMA_SPACE: str = ", " - COMMA_NEWLINE: str = ",\n" - HYPHEN: str = "-" - DOUBLE_HYPHEN: str = "--" - DOT: str = "." - ASTERISK: str = "*" - DOUBLE_ASTERISK: str = "**" - QUESTION_MARK: str = "?" - CARET: str = "^" - DOLLAR: str = "$" - UNDERSCORE: str = "_" - COLON: str = ":" - SEMICOLON: str = ";" - EQUALS: str = "=" - LEFT_PAREN: str = "(" - RIGHT_PAREN: str = ")" - BACKTICK: str = "`" - TILDE: str = "~" - - MATCH_ALL_REGEX_SINGLE_LINE: str = CARET + DOT + ASTERISK + DOLLAR - MATCH_ALL_REGEX_MULTI_LINE: str = DOT + ASTERISK - - S3_PREFIX: str = "s3://" - FILE_PREFIX: str = "file://" - HTTP_PREFIX: str = "http://" - HTTPS_PREFIX: str = "https://" - PORT_REGEX: str = ":(\d+)" - DOCKER_REGEX: str = "\d+\.dkr\.ecr\..*.amazonaws\.com/.*" - - DEFAULT_CHUNK_NAME_PREFIX: str = "part" - - FILES_TO_IGNORE: str = ["_SUCCESS", ".DS_Store"] - - UTF_8: str = "utf-8" - - FILE_SIZE_UNITS: Tuple[str, ...] = ("B", "KB", "MB", "GB", "TB", "PB", "EB", "ZB", "YB") - ## FILE_SIZE_REGEX taken from: https://rgxdb.com/r/4IG91ZFE - ## Matches: "2", "2.5", "2.5b", "2.5B", "2.5k", "2.5K", "2.5kb", "2.5Kb", "2.5KB", "2.5kib", "2.5KiB", "2.5kiB" - ## Does not match: "2.", "2ki", "2ib", "2.5KIB" - FILE_SIZE_REGEX = r"^(\d*\.?\d+)((?=[KMGTkgmt])([KMGTkgmt])(?:i?[Bb])?|[Bb]?)$" - - ALPHABET: Tuple[str, ...] = tuple("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789") - ALPHABET_CAPS: Tuple[str, ...] = tuple("ABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789") - ALPHABET_CAPS_NO_DIGITS: Tuple[str, ...] 
= tuple("ABCDEFGHIJKLMNOPQRSTUVWXYZ") - - BASE2_ALPHABET: str = "01" - BASE16_ALPHABET: str = "0123456789ABCDEF" - BASE56_ALPHABET: str = "23456789ABCDEFGHJKLMNPQRSTUVWXYZabcdefghijkmnpqrstuvwxyz" - BASE36_ALPHABET: str = "0123456789abcdefghijklmnopqrstuvwxyz" - BASE62_ALPHABET: str = "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz" - BASE64_ALPHABET: str = BASE62_ALPHABET + "-_" - - BASE_CONVERTER_MAP: Dict[int, BaseConverter] = { - 2: BaseConverter(BASE2_ALPHABET), - 16: BaseConverter(BASE16_ALPHABET), - 36: BaseConverter(BASE36_ALPHABET), - 56: BaseConverter(BASE56_ALPHABET), - 62: BaseConverter(BASE62_ALPHABET), - 64: BaseConverter(BASE64_ALPHABET, sign="$"), - } - - @classmethod - def str_normalize( - cls, x: str, *, remove: Optional[Union[str, Tuple, List, Set]] = (" ", "-", "_") - ) -> str: - ## Found to be faster than .translate() and re.sub() on Python 3.10.6 - if remove is None: - remove: Set[str] = set() - if isinstance(remove, str): - remove: Set[str] = set(remove) - assert isinstance(remove, (list, tuple, set)) - if len(remove) == 0: - return str(x).lower() - out: str = str(x) - for rem in set(remove).intersection(set(out)): - out: str = out.replace(rem, "") - out: str = out.lower() - return out - - @classmethod - def punct_normalize( - cls, x: str, *, lowercase: bool = True, space: bool = True, numbers: bool = False - ) -> str: - punct_table = { - (False, False, False): _PUNCTUATION_REMOVAL_TABLE, - (True, False, False): _PUNCTUATION_REMOVAL_TABLE_WITH_LOWERCASE, - (False, True, False): _PUNCTUATION_REMOVAL_TABLE_WITH_SPACE, - (True, True, False): _PUNCTUATION_REMOVAL_TABLE_WITH_LOWERCASE_AND_SPACE, - (False, False, True): _PUNCTUATION_REMOVAL_TABLE_WITH_NUMBERS, - (True, False, True): _PUNCTUATION_REMOVAL_TABLE_WITH_LOWERCASE_AND_NUMBERS, - (False, True, True): _PUNCTUATION_REMOVAL_TABLE_WITH_SPACE_AND_NUMBERS, - (True, True, True): _PUNCTUATION_REMOVAL_TABLE_WITH_LOWERCASE_AND_SPACE_AND_NUMBERS, - }[(lowercase, space, numbers)] - return str(x).translate(punct_table) - - @classmethod - def whitespace_normalize(cls, text: str, remove_newlines: bool = False): - ## Remove trailing whitespace at the end of each line - text: str = re.sub(r"\s+$", "", text, flags=re.MULTILINE) - - if remove_newlines: - text: str = text.replace("\n", "") - else: - ## Replace double newlines with single newlines - text: str = re.sub(r"\n\n+", "\n", text) - - ## Replace double spaces with single spaces - text: str = re.sub(r" +", " ", text) - return text.strip() - - @classmethod - def format_exception_msg(cls, ex: Exception, short: bool = False, prefix: str = "[ERROR]") -> str: - ## Ref: https://stackoverflow.com/a/64212552 - tb = ex.__traceback__ - trace = [] - while tb is not None: - trace.append( - { - "filename": tb.tb_frame.f_code.co_filename, - "function_name": tb.tb_frame.f_code.co_name, - "lineno": tb.tb_lineno, - } - ) - tb = tb.tb_next - out = f'{prefix}: {type(ex).__name__}: "{str(ex)}"' - if short: - out += "\nTrace: " - for trace_line in trace: - out += f"{trace_line['filename']}#{trace_line['lineno']}; " - else: - out += "\nTraceback:" - for trace_line in trace: - out += f"\n\t{trace_line['filename']} line {trace_line['lineno']}, in {trace_line['function_name']}..." 
- return out.strip() - - @classmethod - def str_format_args(cls, x: str, named_only: bool = True) -> List[str]: - ## Ref: https://stackoverflow.com/a/46161774/4900327 - args: List[str] = [str(tup[1]) for tup in string.Formatter().parse(x) if tup[1] is not None] - if named_only: - args: List[str] = [arg for arg in args if not arg.isdigit() and len(arg) > 0] - return args - - @classmethod - def assert_not_empty_and_strip(cls, string: str, error_message: str = "") -> str: - cls.assert_not_empty(string, error_message) - return string.strip() - - @classmethod - def strip_if_not_empty(cls, string: str) -> str: - if cls.is_not_empty(string): - return string.strip() - return string - - @classmethod - def is_not_empty(cls, string: str) -> bool: - return isinstance(string, str) and len(string.strip()) > 0 - - @classmethod - def is_not_empty_bytes(cls, string: bytes) -> bool: - return isinstance(string, bytes) and len(string.strip()) > 0 - - @classmethod - def is_not_empty_str_or_bytes(cls, string: Union[str, bytes]) -> bool: - return cls.is_not_empty(string) or cls.is_not_empty_bytes(string) - - @classmethod - def is_empty(cls, string: Any) -> bool: - return not cls.is_not_empty(string) - - @classmethod - def is_empty_bytes(cls, string: Any) -> bool: - return not cls.is_not_empty_bytes(string) - - @classmethod - def is_empty_str_or_bytes(cls, string: Any) -> bool: - return not cls.is_not_empty_str_or_bytes(string) - - @classmethod - def assert_not_empty(cls, string: Any, error_message: str = ""): - assert cls.is_not_empty(string), error_message - - @classmethod - def assert_not_empty_bytes(cls, string: Any, error_message: str = ""): - assert cls.is_not_empty_str_or_bytes(string), error_message - - @classmethod - def assert_not_empty_str_or_bytes(cls, string: Any, error_message: str = ""): - assert cls.is_not_empty_str_or_bytes(string), error_message - - @classmethod - def is_int(cls, string: Any) -> bool: - """ - Checks if an input string is an integer. - :param string: input string - :raises: error when input is not a string - :return: True for '123', '-123' but False for '123.0', '1.23', '-1.23' and '1e2' - """ - try: - int(string) - return True - except Exception: - return False - - @classmethod - def is_float(cls, string: Any) -> bool: - """ - Checks if an input string is a floating-point value. - :param string: input string - :raises: error when input is not a string - :return: True for '123', '1.23', '123.0', '-123', '-123.0', '1e2', '1.23e-5', 'NAN' & 'nan'; but False for 'abc' - """ - try: - float(string) ## Will return True for NaNs as well. 
- return True - except Exception: - return False - - @classmethod - def is_prefix(cls, prefix: str, strings: Union[List[str], Set[str]]) -> bool: - cls.assert_not_empty(prefix) - if isinstance(strings, str): - strings = [strings] - return True in {string.startswith(prefix) for string in strings} - - @classmethod - def remove_prefix(cls, string: str, prefix: str) -> str: - cls.assert_not_empty(prefix) - if string.startswith(prefix): - string = string[len(prefix) :] - return string - - @classmethod - def remove_suffix(cls, string: str, suffix: str) -> str: - cls.assert_not_empty(suffix) - if string.endswith(suffix): - string = string[: -len(suffix)] - return string - - @classmethod - def join_human( - cls, - l: Union[List, Tuple, Set], - sep: str = ",", - final_join: str = "and", - oxford_comma: bool = False, - ) -> str: - l: List = list(l) - if len(l) == 1: - return str(l[0]) - out: str = "" - for x in l[:-1]: - out += " " + str(x) + sep - if not oxford_comma: - out: str = cls.remove_suffix(out, sep) - x = l[-1] - out += f" {final_join} " + str(x) - return out.strip() - - @classmethod - def convert_str_to_type(cls, val: str, expected_type: Type) -> Any: - assert isinstance(expected_type, type) - if isinstance(val, expected_type): - return val - if expected_type == str: - return str(val) - if expected_type == bool and isinstance(val, str): - val = val.lower().strip().capitalize() ## literal_eval does not parse "false", only "False". - out = literal_eval(String.assert_not_empty_and_strip(str(val))) - if expected_type == float and isinstance(out, int): - out = float(out) - if expected_type == int and isinstance(out, float) and int(out) == out: - out = int(out) - if expected_type == tuple and isinstance(out, list): - out = tuple(out) - if expected_type == list and isinstance(out, tuple): - out = list(out) - if expected_type == set and isinstance(out, (list, tuple)): - out = set(out) - if expected_type == bool and out in [0, 1]: - out = bool(out) - if type(out) != expected_type: - raise ValueError(f"Input value {val} cannot be converted to {str(expected_type)}") - return out - - @classmethod - def readable_bytes(cls, size_in_bytes: int, decimals: int = 3) -> str: - sizes: Dict[str, float] = cls.convert_size_from_bytes(size_in_bytes, unit=None, decimals=decimals) - sorted_sizes: List[Tuple[str, float]] = [ - (k, v) for k, v in sorted(sizes.items(), key=lambda item: item[1]) - ] - size_unit, size_val = None, None - for size_unit, size_val in sorted_sizes: - if size_val >= 1: - break - return f"{size_val} {size_unit}" - - @classmethod - def convert_size_from_bytes( - cls, - size_in_bytes: int, - unit: Optional[str] = None, - decimals: int = 3, - ) -> Union[Dict, float]: - size_in_bytes = float(size_in_bytes) - cur_size = size_in_bytes - sizes = {} - if size_in_bytes == 0: - for size_name in cls.FILE_SIZE_UNITS: - sizes[size_name] = 0.0 - else: - for size_name in cls.FILE_SIZE_UNITS: - val: float = round(cur_size, decimals) - i = 1 - while val == 0: - val = round(cur_size, decimals + i) - i += 1 - sizes[size_name] = val - i = int(math.floor(math.log(cur_size, 1024))) - cur_size = cur_size / 1024 - if unit is not None: - assert isinstance(unit, str) - unit = unit.upper() - assert unit in cls.FILE_SIZE_UNITS - return sizes[unit] - return sizes - - @classmethod - def convert_size_to_bytes(cls, size_in_human_readable: str) -> int: - size_in_human_readable: str = cls.assert_not_empty_and_strip(size_in_human_readable).upper() - size_selection_regex = f"""(\d+(?:\.\d+)?) 
*({cls.PIPE.join(cls.FILE_SIZE_UNITS)})""" ## This uses a non-capturing group: https://stackoverflow.com/a/3512530/4900327 - matches = re.findall(size_selection_regex, size_in_human_readable) - if len(matches) != 1 or len(matches[0]) != 2: - raise ValueError(f'Cannot convert value "{size_in_human_readable}" to bytes.') - val, unit = matches[0] - val = float(val) - for file_size_unit in cls.FILE_SIZE_UNITS: - if unit == file_size_unit: - return int(round(val)) - val = val * 1024 - raise ValueError(f'Cannot convert value "{size_in_human_readable}" to bytes.') - - @classmethod - def readable_seconds( - cls, - time_in_seconds: Union[float, timedelta], - *, - decimals: int = 2, - short: bool = False, - ) -> str: - if isinstance(time_in_seconds, timedelta): - time_in_seconds: float = time_in_seconds.total_seconds() - times: Dict[str, float] = cls.convert_time_from_seconds( - time_in_seconds, - unit=None, - decimals=decimals, - short=short, - ) - sorted_times: List[Tuple[str, float]] = [ - (k, v) for k, v in sorted(times.items(), key=lambda item: item[1]) - ] - time_unit, time_val = None, None - for time_unit, time_val in sorted_times: - if time_val >= 1: - break - if decimals <= 0: - time_val = int(time_val) - if short: - return f"{time_val}{time_unit}" - return f"{time_val} {time_unit}" - - @classmethod - def convert_time_from_seconds( - cls, - time_in_seconds: float, - unit: Optional[str] = None, - decimals: int = 3, - short: bool = False, - ) -> Union[Dict, float]: - TIME_UNITS = { - "nanoseconds": 1e-9, - "microseconds": 1e-6, - "milliseconds": 1e-3, - "seconds": 1.0, - "mins": 60, - "hours": 60 * 60, - "days": 24 * 60 * 60, - } - if short: - TIME_UNITS = { - "ns": 1e-9, - "us": 1e-6, - "ms": 1e-3, - "s": 1.0, - "min": 60, - "hr": 60 * 60, - "d": 24 * 60 * 60, - } - time_in_seconds = float(time_in_seconds) - times: Dict[str, float] = { - time_unit: round(time_in_seconds / TIME_UNITS[time_unit], decimals) for time_unit in TIME_UNITS - } - if unit is not None: - assert isinstance(unit, str) - unit = unit.lower() - assert unit in TIME_UNITS - return times[unit] - return times - - @classmethod - def readable_number( - cls, - n: Union[float, int], - decimals: int = 3, - short: bool = True, - scientific: bool = False, - ) -> str: - if n == 0: - return "0" - assert abs(n) > 0 - if 0 < abs(n) < 1: - scientific: bool = True - if scientific: - n_unit: str = "" - n_val: str = f"{n:.{decimals}e}" - else: - numbers: Dict[str, float] = cls.convert_number( - abs(n), - unit=None, - decimals=decimals, - short=short, - ) - sorted_numbers: List[Tuple[str, float]] = [ - (k, v) for k, v in sorted(numbers.items(), key=lambda item: item[1]) - ] - n_unit: Optional[str] = None - n_val: Optional[float] = None - for n_unit, n_val in sorted_numbers: - if n_val >= 1: - break - if decimals <= 0: - n_val: int = int(n_val) - if n_val == int(n_val): - n_val: int = int(n_val) - if n < 0: - n_val: str = f"-{n_val}" - if short: - return f"{n_val}{n_unit}".strip() - return f"{n_val} {n_unit}".strip() - - @classmethod - def convert_number( - cls, - n: float, - unit: Optional[str] = None, - decimals: int = 3, - short: bool = False, - ) -> Union[Dict, float]: - assert n >= 0 - N_UNITS = { - "": 1e0, - "thousand": 1e3, - "million": 1e6, - "billion": 1e9, - "trillion": 1e12, - "quadrillion": 1e15, - "quintillion": 1e18, - } - if short: - N_UNITS = { - "": 1e0, - "K": 1e3, - "M": 1e6, - "B": 1e9, - "T": 1e12, - "Qa": 1e15, - "Qi": 1e18, - } - n: float = float(n) - numbers: Dict[str, float] = {n_unit: round(n / N_UNITS[n_unit], 
decimals) for n_unit in N_UNITS} - if unit is not None: - assert isinstance(unit, str) - unit = unit.lower() - assert unit in N_UNITS - return numbers[unit] - return numbers - - @classmethod - def jsonify( - cls, - blob: StructuredBlob, - *, - minify: bool = False, - ) -> str: - if minify: - return json.dumps(blob, indent=None, separators=(cls.COMMA, cls.COLON), cls=NeverFailJsonEncoder) - else: - return json.dumps(blob, cls=NeverFailJsonEncoder, indent=4) - - @classmethod - def get_num_zeros_to_pad(cls, max_i: int) -> int: - assert isinstance(max_i, int) and max_i >= 1 - num_zeros = math.ceil(math.log10(max_i)) ## Ref: https://stackoverflow.com/a/51837162/4900327 - if max_i == 10**num_zeros: ## If it is a power of 10 - num_zeros += 1 - return num_zeros - - @classmethod - def pad_zeros(cls, i: int, max_i: int = int(1e12)) -> str: - assert isinstance(i, int) - assert i >= 0 - assert isinstance(max_i, int) - assert max_i >= i, f"Expected max_i to be >= current i; found max_i={max_i}, i={i}" - num_zeros: int = cls.get_num_zeros_to_pad(max_i) - return f"{i:0{num_zeros}}" - - @classmethod - def stringify( - cls, - d: Union[Dict, List, Tuple, Set, Any], - *, - sep: str = ",", - key_val_sep: str = "=", - literal: bool = False, - nested_literal: bool = True, - ) -> str: - if isinstance(d, (dict, defaultdict)): - if nested_literal: - out: str = sep.join( - [ - f"{k}" - f"{key_val_sep}" - f"{cls.stringify(v, sep=sep, key_val_sep=key_val_sep, literal=True, nested_literal=True)}" - for k, v in sorted(list(d.items()), key=lambda x: x[0]) - ] - ) - else: - out: str = sep.join( - [ - f"{k}" - f"{key_val_sep}" - f"{cls.stringify(v, sep=sep, key_val_sep=key_val_sep, literal=False, nested_literal=False)}" - for k, v in sorted(list(d.items()), key=lambda x: x[0]) - ] - ) - elif isinstance(d, (list, tuple, set, frozenset, np.ndarray, pd.Series)): - try: - s = sorted(list(d)) - except TypeError: ## Sorting fails - s = list(d) - out: str = sep.join( - [ - f"{cls.stringify(x, sep=sep, key_val_sep=key_val_sep, literal=nested_literal, nested_literal=nested_literal)}" - for x in s - ] - ) - else: - out: str = repr(d) - if literal: - if isinstance(d, list): - out: str = f"[{out}]" - elif isinstance(d, np.ndarray): - out: str = f"np.array([{out}])" - elif isinstance(d, pd.Series): - out: str = f"pd.Series([{out}])" - elif isinstance(d, tuple): - if len(d) == 1: - out: str = f"({out},)" - else: - out: str = f"({out})" - elif isinstance(d, (set, frozenset)): - out: str = f"({out})" - elif isinstance(d, (dict, defaultdict)): - out: str = f"dict({out})" - return out - - @classmethod - def destringify(cls, s: str) -> Any: - if isinstance(s, str): - try: - val = literal_eval(s) - except ValueError: - val = s - else: - val = s - if isinstance(val, float): - if val.is_integer(): - return int(val) - return val - return val - - @classmethod - @validate_arguments - def random( - cls, - shape: Tuple = (1,), - length: Union[conint(ge=1), Tuple[conint(ge=1), conint(ge=1)]] = 6, - spaces_prob: Optional[confloat(ge=0.0, le=1.0)] = None, - alphabet: Tuple = ALPHABET, - seed: Optional[int] = None, - unique: bool = False, - ) -> Union[str, np.ndarray]: - if isinstance(length, int): - min_num_chars: int = length - max_num_chars: int = length - else: - min_num_chars, max_num_chars = length - assert min_num_chars <= max_num_chars, ( - f"Must have min_num_chars ({min_num_chars}) <= max_num_chars ({max_num_chars})" - ) - if spaces_prob is not None: - num_spaces_to_add: int = int(round(len(alphabet) * spaces_prob / (1 - spaces_prob), 0)) - 
alphabet = alphabet + num_spaces_to_add * (cls.SPACE,) - - ## Ref: https://stackoverflow.com/a/25965461/4900327 - np_random = np.random.RandomState(seed=seed) - random_alphabet_lists = np_random.choice(alphabet, shape + (max_num_chars,)) - random_strings: np.ndarray = np.apply_along_axis( - arr=random_alphabet_lists, - func1d=lambda random_alphabet_list: "".join(random_alphabet_list)[ - : np_random.randint(min_num_chars, max_num_chars + 1) - ], - axis=len(shape), - ) - if shape == (1,): - return random_strings[0] - if unique: - random_strings_flatten1d: np.ndarray = random_strings.ravel() - if len(set(random_strings_flatten1d)) != len(random_strings_flatten1d): - ## Call it recursively: - random_strings: np.ndarray = cls.random( - shape=shape, - length=length, - spaces_prob=spaces_prob, - alphabet=alphabet, - seed=seed, - unique=unique, - ) - return random_strings - - @classmethod - def random_name( - cls, - count: int = 1, - *, - sep: str = HYPHEN, - order: Tuple[str, ...] = ("adjective", "verb", "noun"), - seed: Optional[int] = None, - ) -> Union[List[str], str]: - cartesian_product_parts: List[List[str]] = [] - assert len(order) > 0 - for order_part in order: - if order_part == "verb": - cartesian_product_parts.append(cls.RANDOM_VERBS) - elif order_part == "adjective": - cartesian_product_parts.append(cls.RANDOM_ADJECTIVES) - elif order_part == "noun": - cartesian_product_parts.append(cls.RANDOM_NOUNS) - else: - raise NotImplementedError(f'Unrecognized part of the order sequence: "{order_part}"') - - out: List[str] = [ - sep.join(parts) - for parts in cls.__random_cartesian_product(*cartesian_product_parts, seed=seed, n=count) - ] - if count == 1: - return out[0] - return out - - @staticmethod - def __random_cartesian_product(*lists, seed: Optional[int] = None, n: int): - rnd = random.Random(seed) - cartesian_idxs: Set[Tuple[int, ...]] = set() - list_lens: List[int] = [len(l) for l in lists] - max_count: int = 1 - for l_len in list_lens: - max_count *= l_len - if max_count < n: - raise ValueError(f"At most {max_count} cartesian product elements can be created.") - while len(cartesian_idxs) < n: - rnd_idx: Tuple[int, ...] 
= tuple(rnd.randint(0, l_len - 1) for l_len in list_lens) - if rnd_idx not in cartesian_idxs: - cartesian_idxs.add(rnd_idx) - elem = [] - for l_idx, l in zip(rnd_idx, lists): - elem.append(l[l_idx]) - yield elem - - @classmethod - def parse_datetime(cls, dt: Union[str, int, float, datetime]) -> datetime: - if isinstance(dt, datetime): - return dt - elif type(dt) in [int, float]: - return datetime.fromtimestamp(dt) - elif isinstance(dt, str): - return datetime.fromisoformat(dt) - raise NotImplementedError(f"Cannot parse datetime from value {dt} with type {type(dt)}") - - @classmethod - def now(cls, **kwargs) -> str: - dt: datetime = datetime.now() - return cls.readable_datetime(dt, **kwargs) - - @classmethod - def kernel_start_time(cls, **kwargs) -> str: - return cls.readable_datetime(KERNEL_START_DT, **kwargs) - - @classmethod - def readable_datetime( - cls, - dt: datetime, - *, - human: bool = False, - microsec: bool = True, - tz: bool = True, - **kwargs, - ) -> str: - dt: datetime = dt.replace(tzinfo=dt.astimezone().tzinfo) - if human: - format_str: str = "%d%b%Y-%H:%M:%S" - microsec: bool = False - else: - format_str: str = "%Y-%m-%dT%H:%M:%S" - if microsec: - format_str += ".%f" - split_tz_colon: bool = False - if tz and dt.tzinfo is not None: - if human: - format_str += "+%Z" - else: - format_str += "%z" - split_tz_colon: bool = True - out: str = dt.strftime(format_str).strip() - if split_tz_colon: ## Makes the output exactly like dt.isoformat() - out: str = out[:-2] + ":" + out[-2:] - return out - - @classmethod - def convert_integer_to_base_n_str(cls, integer: int, base: int) -> str: - assert isinstance(integer, int) - assert isinstance(base, int) and base in cls.BASE_CONVERTER_MAP, ( - f"Param `base` must be an integer in {list(cls.BASE_CONVERTER_MAP.keys())}; found: {base}" - ) - return cls.BASE_CONVERTER_MAP[base].encode(integer) - - @classmethod - def hash(cls, val: Union[str, int, float, List, Dict], max_len: int = 256, base: int = 62) -> str: - """ - Constructs a hash of a JSON object or value. - :param val: any valid JSON value (including str, int, float, list, and dict). - :param max_len: the maximum length of the output hash (will truncate upto this length). - :param base: the base of the output hash. - Defaults to base56, which encodes the output in a ASCII-chars - :return: SHA256 hash. 
- """ - - def hash_rec(val, base): - if isinstance(val, list): - return hash_rec(",".join([hash_rec(x, base=base) for x in val]), base=base) - elif isinstance(val, dict): - return hash_rec( - [ - f"{hash_rec(k, base=base)}:{hash_rec(v, base=base)}" - for k, v in sorted(val.items(), key=lambda kv: kv[0]) - ], - base=base, - ) - return cls.convert_integer_to_base_n_str( - int(sha256(str(val).encode("utf8")).hexdigest(), 16), base=base - ) - - return hash_rec(val, base)[:max_len] - - @classmethod - def fuzzy_match( - cls, - string: str, - strings_to_match: Union[str, List[str]], - replacements: Tuple = (SPACE, HYPHEN, SLASH), - repl_char: str = UNDERSCORE, - ) -> Optional[str]: - """Gets the closest fuzzy-matched string from the list, or else returns None.""" - if not isinstance(strings_to_match, list) and not isinstance(strings_to_match, tuple): - assert isinstance(strings_to_match, str), ( - f"Input must be of a string or list of strings; found type " - f"{type(strings_to_match)} with value: {strings_to_match}" - ) - strings_to_match: List[str] = [strings_to_match] - string: str = str(string).lower() - strings_to_match_repl: List[str] = [str(s).lower() for s in strings_to_match] - for repl in replacements: - string: str = string.replace(repl, repl_char) - strings_to_match_repl: List[str] = [s.replace(repl, repl_char) for s in strings_to_match_repl] - for i, s in enumerate(strings_to_match_repl): - if string == s: - return strings_to_match[i] - return None - - @classmethod - def is_fuzzy_match(cls, string: str, strings_to_match: List[str]) -> bool: - """Returns whether or not there is a fuzzy-matched string in the list""" - return cls.fuzzy_match(string, strings_to_match) is not None - - @classmethod - def header(cls, text: str, width: int = 65, border: str = "=") -> str: - out = "" - out += border * width + cls.NEWLINE - out += ("{:^" + str(width) + "s}").format(text) + cls.NEWLINE - out += border * width + cls.NEWLINE - return out - - @classmethod - def is_stream(cls, obj) -> bool: - return isinstance(obj, io.IOBase) and hasattr(obj, "read") - - @classmethod - def pretty(cls, d: Any, max_width: int = 100) -> str: - if isinstance(d, dict): - return pprint.pformat(d, indent=4, width=max_width) - return pprint.pformat(d, width=max_width) - - @classmethod - def dedupe(cls, text: str, dedupe: str) -> str: - while (2 * dedupe) in text: - text: str = text.replace(2 * dedupe, dedupe) - return text - - ## Taken from: https://github.com/moby/moby/blob/0ad2293d0e5bbf4c966a0e8b27c3ac3835265577/pkg/namesgenerator/names-generator.go - RANDOM_NAME_LEFT: List[str] = [ - "admiring", - "adoring", - "affectionate", - "agitated", - "amazing", - "angry", - "awesome", - "beautiful", - "blissful", - "bold", - "boring", - "brave", - "busy", - "charming", - "clever", - "cool", - "compassionate", - "competent", - "condescending", - "confident", - "cranky", - "crazy", - "dazzling", - "determined", - "distracted", - "dreamy", - "eager", - "ecstatic", - "elastic", - "elated", - "elegant", - "eloquent", - "epic", - "exciting", - "fervent", - "festive", - "flamboyant", - "focused", - "friendly", - "frosty", - "funny", - "gallant", - "gifted", - "goofy", - "gracious", - "great", - "happy", - "hardcore", - "heuristic", - "hopeful", - "hungry", - "infallible", - "inspiring", - "interesting", - "intelligent", - "jolly", - "jovial", - "keen", - "kind", - "laughing", - "loving", - "lucid", - "magical", - "mystifying", - "modest", - "musing", - "naughty", - "nervous", - "nice", - "nifty", - "nostalgic", - "objective", - 
"optimistic", - "peaceful", - "pedantic", - "pensive", - "practical", - "priceless", - "quirky", - "quizzical", - "recursing", - "relaxed", - "reverent", - "romantic", - "sad", - "serene", - "sharp", - "silly", - "sleepy", - "stoic", - "strange", - "stupefied", - "suspicious", - "sweet", - "tender", - "thirsty", - "trusting", - "unruffled", - "upbeat", - "vibrant", - "vigilant", - "vigorous", - "wizardly", - "wonderful", - "xenodochial", - "youthful", - "zealous", - "zen", - ] - RANDOM_NAME_RIGHT: List[str] = [ - "albattani", - "allen", - "almeida", - "antonelli", - "agnesi", - "archimedes", - "ardinghelli", - "aryabhata", - "austin", - "babbage", - "banach", - "banzai", - "bardeen", - "bartik", - "bassi", - "beaver", - "bell", - "benz", - "bhabha", - "bhaskara", - "black", - "blackburn", - "blackwell", - "bohr", - "booth", - "borg", - "bose", - "bouman", - "boyd", - "brahmagupta", - "brattain", - "brown", - "buck", - "burnell", - "cannon", - "carson", - "cartwright", - "carver", - "cerf", - "chandrasekhar", - "chaplygin", - "chatelet", - "chatterjee", - "chebyshev", - "cohen", - "chaum", - "clarke", - "colden", - "cori", - "cray", - "curran", - "curie", - "darwin", - "davinci", - "dewdney", - "dhawan", - "diffie", - "dijkstra", - "dirac", - "driscoll", - "dubinsky", - "easley", - "edison", - "einstein", - "elbakyan", - "elgamal", - "elion", - "ellis", - "engelbart", - "euclid", - "euler", - "faraday", - "feistel", - "fermat", - "fermi", - "feynman", - "franklin", - "gagarin", - "galileo", - "galois", - "ganguly", - "gates", - "gauss", - "germain", - "goldberg", - "goldstine", - "goldwasser", - "golick", - "goodall", - "gould", - "greider", - "grothendieck", - "haibt", - "hamilton", - "haslett", - "hawking", - "hellman", - "heisenberg", - "hermann", - "herschel", - "hertz", - "heyrovsky", - "hodgkin", - "hofstadter", - "hoover", - "hopper", - "hugle", - "hypatia", - "ishizaka", - "jackson", - "jang", - "jemison", - "jennings", - "jepsen", - "johnson", - "joliot", - "jones", - "kalam", - "kapitsa", - "kare", - "keldysh", - "keller", - "kepler", - "khayyam", - "khorana", - "kilby", - "kirch", - "knuth", - "kowalevski", - "lalande", - "lamarr", - "lamport", - "leakey", - "leavitt", - "lederberg", - "lehmann", - "lewin", - "lichterman", - "liskov", - "lovelace", - "lumiere", - "mahavira", - "margulis", - "matsumoto", - "maxwell", - "mayer", - "mccarthy", - "mcclintock", - "mclaren", - "mclean", - "mcnulty", - "mendel", - "mendeleev", - "meitner", - "meninsky", - "merkle", - "mestorf", - "mirzakhani", - "montalcini", - "moore", - "morse", - "murdock", - "moser", - "napier", - "nash", - "neumann", - "newton", - "nightingale", - "nobel", - "noether", - "northcutt", - "noyce", - "panini", - "pare", - "pascal", - "pasteur", - "payne", - "perlman", - "pike", - "poincare", - "poitras", - "proskuriakova", - "ptolemy", - "raman", - "ramanujan", - "ride", - "ritchie", - "rhodes", - "robinson", - "roentgen", - "rosalind", - "rubin", - "saha", - "sammet", - "sanderson", - "satoshi", - "shamir", - "shannon", - "shaw", - "shirley", - "shockley", - "shtern", - "sinoussi", - "snyder", - "solomon", - "spence", - "stonebraker", - "sutherland", - "swanson", - "swartz", - "swirles", - "taussig", - "tereshkova", - "tesla", - "tharp", - "thompson", - "torvalds", - "tu", - "turing", - "varahamihira", - "vaughan", - "visvesvaraya", - "volhard", - "villani", - "wescoff", - "wilbur", - "wiles", - "williams", - "williamson", - "wilson", - "wing", - "wozniak", - "wright", - "wu", - "yalow", - "yonath", - "zhukovsky", - ] 
- - ## Taken from: https://github.com/mrmaxguns/wonderwordsmodule/tree/master/wonderwords/assets - RANDOM_VERBS: List[str] = [ - "abide", - "accelerate", - "accept", - "accomplish", - "achieve", - "acquire", - "acted", - "activate", - "adapt", - "add", - "address", - "administer", - "admire", - "admit", - "adopt", - "advise", - "afford", - "agree", - "alert", - "alight", - "allow", - "altered", - "amuse", - "analyze", - "announce", - "annoy", - "answer", - "anticipate", - "apologize", - "appear", - "applaud", - "applied", - "appoint", - "appraise", - "appreciate", - "approve", - "arbitrate", - "argue", - "arise", - "arrange", - "arrest", - "arrive", - "ascertain", - "ask", - "assemble", - "assess", - "assist", - "assure", - "attach", - "attack", - "attain", - "attempt", - "attend", - "attract", - "audited", - "avoid", - "awake", - "back", - "bake", - "balance", - "ban", - "bang", - "bare", - "bat", - "bathe", - "battle", - "be", - "beam", - "bear", - "beat", - "become", - "beg", - "begin", - "behave", - "behold", - "belong", - "bend", - "beset", - "bet", - "bid", - "bind", - "bite", - "bleach", - "bleed", - "bless", - "blind", - "blink", - "blot", - "blow", - "blush", - "boast", - "boil", - "bolt", - "bomb", - "book", - "bore", - "borrow", - "bounce", - "bow", - "box", - "brake", - "branch", - "break", - "breathe", - "breed", - "brief", - "bring", - "broadcast", - "bruise", - "brush", - "bubble", - "budget", - "build", - "bump", - "burn", - "burst", - "bury", - "bust", - "buy", - "buze", - "calculate", - "call", - "camp", - "care", - "carry", - "carve", - "cast", - "catalog", - "catch", - "cause", - "challenge", - "change", - "charge", - "chart", - "chase", - "cheat", - "check", - "cheer", - "chew", - "choke", - "choose", - "chop", - "claim", - "clap", - "clarify", - "classify", - "clean", - "clear", - "cling", - "clip", - "close", - "clothe", - "coach", - "coil", - "collect", - "color", - "comb", - "come", - "command", - "communicate", - "compare", - "compete", - "compile", - "complain", - "complete", - "compose", - "compute", - "conceive", - "concentrate", - "conceptualize", - "concern", - "conclude", - "conduct", - "confess", - "confront", - "confuse", - "connect", - "conserve", - "consider", - "consist", - "consolidate", - "construct", - "consult", - "contain", - "continue", - "contract", - "control", - "convert", - "coordinate", - "copy", - "correct", - "correlate", - "cost", - "cough", - "counsel", - "count", - "cover", - "crack", - "crash", - "crawl", - "create", - "creep", - "critique", - "cross", - "crush", - "cry", - "cure", - "curl", - "curve", - "cut", - "cycle", - "dam", - "damage", - "dance", - "dare", - "deal", - "decay", - "deceive", - "decide", - "decorate", - "define", - "delay", - "delegate", - "delight", - "deliver", - "demonstrate", - "depend", - "describe", - "desert", - "deserve", - "design", - "destroy", - "detail", - "detect", - "determine", - "develop", - "devise", - "diagnose", - "dig", - "direct", - "disagree", - "disappear", - "disapprove", - "disarm", - "discover", - "dislike", - "dispense", - "display", - "disprove", - "dissect", - "distribute", - "dive", - "divert", - "divide", - "do", - "double", - "doubt", - "draft", - "drag", - "drain", - "dramatize", - "draw", - "dream", - "dress", - "drink", - "drip", - "drive", - "drop", - "drown", - "drum", - "dry", - "dust", - "dwell", - "earn", - "eat", - "edited", - "educate", - "eliminate", - "embarrass", - "employ", - "empty", - "enacted", - "encourage", - "end", - "endure", - "enforce", - "engineer", - 
"enhance", - "enjoy", - "enlist", - "ensure", - "enter", - "entertain", - "escape", - "establish", - "estimate", - "evaluate", - "examine", - "exceed", - "excite", - "excuse", - "execute", - "exercise", - "exhibit", - "exist", - "expand", - "expect", - "expedite", - "experiment", - "explain", - "explode", - "express", - "extend", - "extract", - "face", - "facilitate", - "fade", - "fail", - "fancy", - "fasten", - "fax", - "fear", - "feed", - "feel", - "fence", - "fetch", - "fight", - "file", - "fill", - "film", - "finalize", - "finance", - "find", - "fire", - "fit", - "fix", - "flap", - "flash", - "flee", - "fling", - "float", - "flood", - "flow", - "flower", - "fly", - "fold", - "follow", - "fool", - "forbid", - "force", - "forecast", - "forego", - "foresee", - "foretell", - "forget", - "forgive", - "form", - "formulate", - "forsake", - "frame", - "freeze", - "frighten", - "fry", - "gather", - "gaze", - "generate", - "get", - "give", - "glow", - "glue", - "go", - "govern", - "grab", - "graduate", - "grate", - "grease", - "greet", - "grin", - "grind", - "grip", - "groan", - "grow", - "guarantee", - "guard", - "guess", - "guide", - "hammer", - "hand", - "handle", - "handwrite", - "hang", - "happen", - "harass", - "harm", - "hate", - "haunt", - "head", - "heal", - "heap", - "hear", - "heat", - "help", - "hide", - "hit", - "hold", - "hook", - "hop", - "hope", - "hover", - "hug", - "hum", - "hunt", - "hurry", - "hurt", - "hypothesize", - "identify", - "ignore", - "illustrate", - "imagine", - "implement", - "impress", - "improve", - "improvise", - "include", - "increase", - "induce", - "influence", - "inform", - "initiate", - "inject", - "injure", - "inlay", - "innovate", - "input", - "inspect", - "inspire", - "install", - "institute", - "instruct", - "insure", - "integrate", - "intend", - "intensify", - "interest", - "interfere", - "interlay", - "interpret", - "interrupt", - "interview", - "introduce", - "invent", - "inventory", - "investigate", - "invite", - "irritate", - "itch", - "jail", - "jam", - "jog", - "join", - "joke", - "judge", - "juggle", - "jump", - "justify", - "keep", - "kept", - "kick", - "kill", - "kiss", - "kneel", - "knit", - "knock", - "knot", - "know", - "label", - "land", - "last", - "laugh", - "launch", - "lay", - "lead", - "lean", - "leap", - "learn", - "leave", - "lecture", - "led", - "lend", - "let", - "level", - "license", - "lick", - "lie", - "lifted", - "light", - "lighten", - "like", - "list", - "listen", - "live", - "load", - "locate", - "lock", - "log", - "long", - "look", - "lose", - "love", - "maintain", - "make", - "man", - "manage", - "manipulate", - "manufacture", - "map", - "march", - "mark", - "market", - "marry", - "match", - "mate", - "matter", - "mean", - "measure", - "meddle", - "mediate", - "meet", - "melt", - "melt", - "memorize", - "mend", - "mentor", - "milk", - "mine", - "mislead", - "miss", - "misspell", - "mistake", - "misunderstand", - "mix", - "moan", - "model", - "modify", - "monitor", - "moor", - "motivate", - "mourn", - "move", - "mow", - "muddle", - "mug", - "multiply", - "murder", - "nail", - "name", - "navigate", - "need", - "negotiate", - "nest", - "nod", - "nominate", - "normalize", - "note", - "notice", - "number", - "obey", - "object", - "observe", - "obtain", - "occur", - "offend", - "offer", - "officiate", - "open", - "operate", - "order", - "organize", - "oriented", - "originate", - "overcome", - "overdo", - "overdraw", - "overflow", - "overhear", - "overtake", - "overthrow", - "owe", - "own", - "pack", - "paddle", - "paint", - 
"park", - "part", - "participate", - "pass", - "paste", - "pat", - "pause", - "pay", - "peck", - "pedal", - "peel", - "peep", - "perceive", - "perfect", - "perform", - "permit", - "persuade", - "phone", - "photograph", - "pick", - "pilot", - "pinch", - "pine", - "pinpoint", - "pioneer", - "place", - "plan", - "plant", - "play", - "plead", - "please", - "plug", - "point", - "poke", - "polish", - "pop", - "possess", - "post", - "pour", - "practice", - "praised", - "pray", - "preach", - "precede", - "predict", - "prefer", - "prepare", - "prescribe", - "present", - "preserve", - "preset", - "preside", - "press", - "pretend", - "prevent", - "prick", - "print", - "process", - "procure", - "produce", - "profess", - "program", - "progress", - "project", - "promise", - "promote", - "proofread", - "propose", - "protect", - "prove", - "provide", - "publicize", - "pull", - "pump", - "punch", - "puncture", - "punish", - "purchase", - "push", - "put", - "qualify", - "question", - "queue", - "quit", - "race", - "radiate", - "rain", - "raise", - "rank", - "rate", - "reach", - "read", - "realign", - "realize", - "reason", - "receive", - "recognize", - "recommend", - "reconcile", - "record", - "recruit", - "reduce", - "refer", - "reflect", - "refuse", - "regret", - "regulate", - "rehabilitate", - "reign", - "reinforce", - "reject", - "rejoice", - "relate", - "relax", - "release", - "rely", - "remain", - "remember", - "remind", - "remove", - "render", - "reorganize", - "repair", - "repeat", - "replace", - "reply", - "report", - "represent", - "reproduce", - "request", - "rescue", - "research", - "resolve", - "respond", - "restored", - "restructure", - "retire", - "retrieve", - "return", - "review", - "revise", - "rhyme", - "rid", - "ride", - "ring", - "rinse", - "rise", - "risk", - "rob", - "rock", - "roll", - "rot", - "rub", - "ruin", - "rule", - "run", - "rush", - "sack", - "sail", - "satisfy", - "save", - "saw", - "say", - "scare", - "scatter", - "schedule", - "scold", - "scorch", - "scrape", - "scratch", - "scream", - "screw", - "scribble", - "scrub", - "seal", - "search", - "secure", - "see", - "seek", - "select", - "sell", - "send", - "sense", - "separate", - "serve", - "service", - "set", - "settle", - "sew", - "shade", - "shake", - "shape", - "share", - "shave", - "shear", - "shed", - "shelter", - "shine", - "shiver", - "shock", - "shoe", - "shoot", - "shop", - "show", - "shrink", - "shrug", - "shut", - "sigh", - "sign", - "signal", - "simplify", - "sin", - "sing", - "sink", - "sip", - "sit", - "sketch", - "ski", - "skip", - "slap", - "slay", - "sleep", - "slide", - "sling", - "slink", - "slip", - "slit", - "slow", - "smash", - "smell", - "smile", - "smite", - "smoke", - "snatch", - "sneak", - "sneeze", - "sniff", - "snore", - "snow", - "soak", - "solve", - "soothe", - "soothsay", - "sort", - "sound", - "sow", - "spare", - "spark", - "sparkle", - "speak", - "specify", - "speed", - "spell", - "spend", - "spill", - "spin", - "spit", - "split", - "spoil", - "spot", - "spray", - "spread", - "spring", - "sprout", - "squash", - "squeak", - "squeal", - "squeeze", - "stain", - "stamp", - "stand", - "stare", - "start", - "stay", - "steal", - "steer", - "step", - "stick", - "stimulate", - "sting", - "stink", - "stir", - "stitch", - "stop", - "store", - "strap", - "streamline", - "strengthen", - "stretch", - "stride", - "strike", - "string", - "strip", - "strive", - "stroke", - "structure", - "study", - "stuff", - "sublet", - "subtract", - "succeed", - "suck", - "suffer", - "suggest", - "suit", - "summarize", - 
"supervise", - "supply", - "support", - "suppose", - "surprise", - "surround", - "suspect", - "suspend", - "swear", - "sweat", - "sweep", - "swell", - "swim", - "swing", - "switch", - "symbolize", - "synthesize", - "systemize", - "tabulate", - "take", - "talk", - "tame", - "tap", - "target", - "taste", - "teach", - "tear", - "tease", - "telephone", - "tell", - "tempt", - "terrify", - "test", - "thank", - "thaw", - "think", - "thrive", - "throw", - "thrust", - "tick", - "tickle", - "tie", - "time", - "tip", - "tire", - "touch", - "tour", - "tow", - "trace", - "trade", - "train", - "transcribe", - "transfer", - "transform", - "translate", - "transport", - "trap", - "travel", - "tread", - "treat", - "tremble", - "trick", - "trip", - "trot", - "trouble", - "troubleshoot", - "trust", - "try", - "tug", - "tumble", - "turn", - "tutor", - "twist", - "type", - "undergo", - "understand", - "undertake", - "undress", - "unfasten", - "unify", - "unite", - "unlock", - "unpack", - "untidy", - "update", - "upgrade", - "uphold", - "upset", - "use", - "utilize", - "vanish", - "verbalize", - "verify", - "vex", - "visit", - "wail", - "wait", - "wake", - "walk", - "wander", - "want", - "warm", - "warn", - "wash", - "waste", - "watch", - "water", - "wave", - "wear", - "weave", - "wed", - "weep", - "weigh", - "welcome", - "wend", - "wet", - "whine", - "whip", - "whirl", - "whisper", - "whistle", - "win", - "wind", - "wink", - "wipe", - "wish", - "withdraw", - "withhold", - "withstand", - "wobble", - "wonder", - "work", - "worry", - "wrap", - "wreck", - "wrestle", - "wriggle", - "wring", - "write", - "x-ray", - "yawn", - "yell", - "zip", - "zoom", - ] - - RANDOM_ADJECTIVES: List[str] = [ - "quizzical", - "highfalutin", - "dynamic", - "wakeful", - "cheerful", - "thoughtful", - "cooperative", - "questionable", - "abundant", - "uneven", - "yummy", - "juicy", - "vacuous", - "concerned", - "young", - "sparkling", - "abhorrent", - "sweltering", - "late", - "macho", - "scrawny", - "friendly", - "kaput", - "divergent", - "busy", - "charming", - "protective", - "premium", - "puzzled", - "waggish", - "rambunctious", - "puffy", - "hard", - "fat", - "sedate", - "yellow", - "resonant", - "dapper", - "courageous", - "vast", - "cool", - "elated", - "wary", - "bewildered", - "level", - "wooden", - "ceaseless", - "tearful", - "cloudy", - "gullible", - "flashy", - "trite", - "quick", - "nondescript", - "round", - "slow", - "spiritual", - "brave", - "tenuous", - "abstracted", - "colossal", - "sloppy", - "obsolete", - "elegant", - "fabulous", - "vivacious", - "exuberant", - "faithful", - "helpless", - "odd", - "sordid", - "blue", - "imported", - "ugly", - "ruthless", - "deeply", - "eminent", - "reminiscent", - "rotten", - "sour", - "volatile", - "succinct", - "judicious", - "abrupt", - "learned", - "stereotyped", - "evanescent", - "efficacious", - "festive", - "loose", - "torpid", - "condemned", - "selective", - "strong", - "momentous", - "ordinary", - "dry", - "great", - "ultra", - "ahead", - "broken", - "dusty", - "piquant", - "creepy", - "miniature", - "periodic", - "equable", - "unsightly", - "narrow", - "grieving", - "whimsical", - "fantastic", - "kindhearted", - "miscreant", - "cowardly", - "cloistered", - "marked", - "bloody", - "chunky", - "undesirable", - "oval", - "nauseating", - "aberrant", - "stingy", - "standing", - "distinct", - "illegal", - "angry", - "faint", - "rustic", - "few", - "calm", - "gorgeous", - "mysterious", - "tacky", - "unadvised", - "greasy", - "minor", - "loving", - "melodic", - "flat", - "wretched", - 
"clever", - "barbarous", - "pretty", - "endurable", - "handsomely", - "unequaled", - "acceptable", - "symptomatic", - "hurt", - "tested", - "long", - "warm", - "ignorant", - "ashamed", - "excellent", - "known", - "adamant", - "eatable", - "verdant", - "meek", - "unbiased", - "rampant", - "somber", - "cuddly", - "harmonious", - "salty", - "overwrought", - "stimulating", - "beautiful", - "crazy", - "grouchy", - "thirsty", - "joyous", - "confused", - "terrible", - "high", - "unarmed", - "gabby", - "wet", - "sharp", - "wonderful", - "magenta", - "tan", - "huge", - "productive", - "defective", - "chilly", - "needy", - "imminent", - "flaky", - "fortunate", - "neighborly", - "hot", - "husky", - "optimal", - "gaping", - "faulty", - "guttural", - "massive", - "watery", - "abrasive", - "ubiquitous", - "aspiring", - "impartial", - "annoyed", - "billowy", - "lucky", - "panoramic", - "heartbreaking", - "fragile", - "purring", - "wistful", - "burly", - "filthy", - "psychedelic", - "harsh", - "disagreeable", - "ambiguous", - "short", - "splendid", - "crowded", - "light", - "yielding", - "hypnotic", - "dispensable", - "deserted", - "nonchalant", - "green", - "puny", - "deafening", - "classy", - "tall", - "typical", - "exclusive", - "materialistic", - "mute", - "shaky", - "inconclusive", - "rebellious", - "doubtful", - "telling", - "unsuitable", - "woebegone", - "cold", - "sassy", - "arrogant", - "perfect", - "adhesive", - "industrious", - "crabby", - "curly", - "voiceless", - "nostalgic", - "better", - "slippery", - "willing", - "nifty", - "orange", - "victorious", - "ritzy", - "wacky", - "vigorous", - "spotless", - "good", - "powerful", - "bashful", - "soggy", - "grubby", - "moaning", - "placid", - "permissible", - "half", - "towering", - "bawdy", - "measly", - "abaft", - "delightful", - "goofy", - "capricious", - "nonstop", - "addicted", - "acoustic", - "furtive", - "erratic", - "heavy", - "square", - "delicious", - "needless", - "resolute", - "innocent", - "abnormal", - "hurried", - "awful", - "impossible", - "aloof", - "giddy", - "large", - "pointless", - "petite", - "jolly", - "boundless", - "abounding", - "hilarious", - "heavenly", - "honorable", - "squeamish", - "red", - "phobic", - "trashy", - "pathetic", - "parched", - "godly", - "greedy", - "pleasant", - "small", - "aboriginal", - "dashing", - "icky", - "bumpy", - "laughable", - "hapless", - "silent", - "scary", - "shaggy", - "organic", - "unbecoming", - "inexpensive", - "wrong", - "repulsive", - "flawless", - "labored", - "disturbed", - "aboard", - "gusty", - "loud", - "jumbled", - "exotic", - "vulgar", - "threatening", - "belligerent", - "synonymous", - "encouraging", - "fancy", - "embarrassed", - "clumsy", - "fast", - "ethereal", - "chubby", - "high-pitched", - "plastic", - "open", - "straight", - "little", - "ancient", - "fair", - "psychotic", - "murky", - "earthy", - "callous", - "heady", - "lamentable", - "hallowed", - "obtainable", - "toothsome", - "oafish", - "gainful", - "flippant", - "tangy", - "tightfisted", - "damaging", - "utopian", - "gaudy", - "brainy", - "imperfect", - "shiny", - "fanatical", - "snotty", - "relieved", - "shallow", - "foamy", - "parsimonious", - "gruesome", - "elite", - "wide", - "kind", - "bored", - "tangible", - "depressed", - "boring", - "screeching", - "outrageous", - "determined", - "picayune", - "glossy", - "historical", - "staking", - "curious", - "gigantic", - "wandering", - "profuse", - "vengeful", - "glib", - "unaccountable", - "frightened", - "outstanding", - "chivalrous", - "workable", - "modern", - 
"swanky", - "comfortable", - "gentle", - "substantial", - "brawny", - "curved", - "nebulous", - "boorish", - "afraid", - "fierce", - "efficient", - "lackadaisical", - "recondite", - "internal", - "absorbed", - "squealing", - "frail", - "thundering", - "wanting", - "cooing", - "axiomatic", - "debonair", - "boiling", - "tired", - "numberless", - "flowery", - "mushy", - "enthusiastic", - "proud", - "upset", - "hungry", - "astonishing", - "deadpan", - "prickly", - "mammoth", - "absurd", - "clean", - "jittery", - "wry", - "entertaining", - "literate", - "lying", - "uninterested", - "aquatic", - "super", - "languid", - "cute", - "absorbing", - "scattered", - "brief", - "halting", - "bright", - "fuzzy", - "lethal", - "scarce", - "aggressive", - "obsequious", - "fine", - "giant", - "holistic", - "pastoral", - "stormy", - "quaint", - "nervous", - "wasteful", - "grotesque", - "loutish", - "abiding", - "unable", - "black", - "dysfunctional", - "knowledgeable", - "truculent", - "various", - "luxuriant", - "shrill", - "spiffy", - "guarded", - "colorful", - "misty", - "spurious", - "freezing", - "glamorous", - "famous", - "new", - "instinctive", - "nasty", - "exultant", - "seemly", - "tawdry", - "maniacal", - "wrathful", - "shy", - "nutritious", - "idiotic", - "worried", - "bad", - "stupid", - "ruddy", - "wholesale", - "naughty", - "thoughtless", - "futuristic", - "available", - "slimy", - "cynical", - "fluffy", - "plausible", - "nasty", - "tender", - "changeable", - "smiling", - "oceanic", - "satisfying", - "steadfast", - "ugliest", - "crooked", - "subsequent", - "fascinated", - "woozy", - "teeny", - "quickest", - "moldy", - "uppity", - "sable", - "horrible", - "silly", - "ad hoc", - "numerous", - "berserk", - "wiry", - "knowing", - "lazy", - "childlike", - "zippy", - "fearless", - "pumped", - "weak", - "tacit", - "weary", - "rapid", - "precious", - "smoggy", - "swift", - "lyrical", - "steep", - "quack", - "direful", - "talented", - "hesitant", - "fallacious", - "ill", - "quarrelsome", - "quiet", - "flipped-out", - "didactic", - "fluttering", - "glorious", - "tough", - "sulky", - "elfin", - "abortive", - "sweet", - "habitual", - "supreme", - "hollow", - "possessive", - "inquisitive", - "adjoining", - "incandescent", - "lowly", - "majestic", - "bizarre", - "acrid", - "expensive", - "aback", - "unusual", - "foolish", - "jobless", - "capable", - "damp", - "political", - "dazzling", - "erect", - "Early", - "immense", - "hellish", - "omniscient", - "reflective", - "lovely", - "incompetent", - "empty", - "breakable", - "educated", - "easy", - "devilish", - "assorted", - "decorous", - "jaded", - "homely", - "dangerous", - "adaptable", - "coherent", - "dramatic", - "tense", - "abject", - "fretful", - "troubled", - "diligent", - "solid", - "plain", - "raspy", - "irate", - "offbeat", - "healthy", - "melted", - "cagey", - "many", - "wild", - "venomous", - "animated", - "alike", - "youthful", - "ripe", - "alcoholic", - "sincere", - "teeny-tiny", - "lush", - "defeated", - "zonked", - "foregoing", - "dizzy", - "frantic", - "obnoxious", - "funny", - "damaged", - "grandiose", - "spectacular", - "maddening", - "defiant", - "makeshift", - "strange", - "painstaking", - "merciful", - "madly", - "clammy", - "itchy", - "difficult", - "clear", - "used", - "temporary", - "abandoned", - "null", - "rainy", - "evil", - "alert", - "domineering", - "amuck", - "rabid", - "jealous", - "robust", - "obeisant", - "overt", - "enchanting", - "longing", - "cautious", - "motionless", - "bitter", - "anxious", - "craven", - "breezy", - 
"ragged", - "skillful", - "quixotic", - "knotty", - "grumpy", - "dark", - "draconian", - "alluring", - "magical", - "versed", - "humdrum", - "accurate", - "ludicrous", - "sleepy", - "envious", - "lavish", - "roasted", - "thinkable", - "overconfident", - "roomy", - "painful", - "wee", - "observant", - "old-fashioned", - "drunk", - "royal", - "likeable", - "adventurous", - "eager", - "obedient", - "attractive", - "x-rated", - "spooky", - "poised", - "righteous", - "excited", - "real", - "abashed", - "womanly", - "ambitious", - "lacking", - "testy", - "big", - "gamy", - "early", - "auspicious", - "blue-eyed", - "discreet", - "nappy", - "vague", - "helpful", - "nosy", - "perpetual", - "disillusioned", - "overrated", - "gleaming", - "tart", - "soft", - "agreeable", - "therapeutic", - "accessible", - "poor", - "gifted", - "old", - "humorous", - "flagrant", - "magnificent", - "alive", - "understood", - "economic", - "mighty", - "ablaze", - "racial", - "tasteful", - "purple", - "broad", - "lean", - "legal", - "witty", - "nutty", - "icy", - "feigned", - "redundant", - "adorable", - "apathetic", - "jumpy", - "scientific", - "combative", - "worthless", - "tasteless", - "voracious", - "jazzy", - "uptight", - "utter", - "hospitable", - "imaginary", - "finicky", - "shocking", - "dead", - "noisy", - "shivering", - "subdued", - "rare", - "zealous", - "demonic", - "ratty", - "snobbish", - "deranged", - "muddy", - "whispering", - "credible", - "hulking", - "fertile", - "tight", - "abusive", - "functional", - "obscene", - "thankful", - "daffy", - "smelly", - "lively", - "homeless", - "secretive", - "amused", - "lewd", - "mere", - "agonizing", - "sad", - "innate", - "sneaky", - "noxious", - "illustrious", - "alleged", - "cultured", - "tame", - "macabre", - "lonely", - "mindless", - "low", - "scintillating", - "statuesque", - "decisive", - "rhetorical", - "hysterical", - "happy", - "earsplitting", - "mundane", - "spicy", - "overjoyed", - "taboo", - "peaceful", - "forgetful", - "elderly", - "upbeat", - "squalid", - "warlike", - "dull", - "plucky", - "handsome", - "groovy", - "absent", - "wise", - "romantic", - "invincible", - "receptive", - "smooth", - "different", - "tiny", - "cruel", - "dirty", - "mature", - "faded", - "tiresome", - "wicked", - "average", - "panicky", - "detailed", - "juvenile", - "scandalous", - "steady", - "wealthy", - "deep", - "sticky", - "jagged", - "wide-eyed", - "tasty", - "disgusted", - "garrulous", - "graceful", - "tranquil", - "annoying", - "hissing", - "noiseless", - "selfish", - "onerous", - "lopsided", - "ossified", - "penitent", - "malicious", - "aromatic", - "successful", - "zany", - "evasive", - "wet", - "naive", - "nice", - "uttermost", - "brash", - "muddled", - "energetic", - "accidental", - "silky", - "guiltless", - "important", - "drab", - "aware", - "skinny", - "careful", - "rightful", - "tricky", - "sore", - "rich", - "blushing", - "stale", - "daily", - "watchful", - "uncovered", - "rough", - "fresh", - "hushed", - "rural", - ] - - RANDOM_NOUNS: List[str] = [ - "aardvark", - "abacus", - "abbey", - "abbreviation", - "abdomen", - "ability", - "abnormality", - "abolishment", - "abrogation", - "absence", - "abundance", - "abuse", - "academics", - "academy", - "accelerant", - "accelerator", - "accent", - "acceptance", - "access", - "accessory", - "accident", - "accommodation", - "accompanist", - "accomplishment", - "accord", - "accordance", - "accordion", - "account", - "accountability", - "accountant", - "accounting", - "accuracy", - "accusation", - "acetate", - 
"achievement", - "achiever", - "acid", - "acknowledgment", - "acorn", - "acoustics", - "acquaintance", - "acquisition", - "acre", - "acrylic", - "act", - "action", - "activation", - "activist", - "activity", - "actor", - "actress", - "acupuncture", - "ad", - "adaptation", - "adapter", - "addiction", - "addition", - "address", - "adjective", - "adjustment", - "admin", - "administration", - "administrator", - "admire", - "admission", - "adobe", - "adoption", - "adrenalin", - "adrenaline", - "adult", - "adulthood", - "advance", - "advancement", - "advantage", - "advent", - "adverb", - "advertisement", - "advertising", - "advice", - "adviser", - "advocacy", - "advocate", - "affair", - "affect", - "affidavit", - "affiliate", - "affinity", - "afoul", - "afterlife", - "aftermath", - "afternoon", - "aftershave", - "aftershock", - "afterthought", - "age", - "agency", - "agenda", - "agent", - "aggradation", - "aggression", - "aglet", - "agony", - "agreement", - "agriculture", - "aid", - "aide", - "aim", - "air", - "airbag", - "airbus", - "aircraft", - "airfare", - "airfield", - "airforce", - "airline", - "airmail", - "airman", - "airplane", - "airport", - "airship", - "airspace", - "alarm", - "alb", - "albatross", - "album", - "alcohol", - "alcove", - "alder", - "ale", - "alert", - "alfalfa", - "algebra", - "algorithm", - "alias", - "alibi", - "alien", - "allegation", - "allergist", - "alley", - "alliance", - "alligator", - "allocation", - "allowance", - "alloy", - "alluvium", - "almanac", - "almighty", - "almond", - "alpaca", - "alpenglow", - "alpenhorn", - "alpha", - "alphabet", - "altar", - "alteration", - "alternative", - "altitude", - "alto", - "aluminium", - "aluminum", - "amazement", - "amazon", - "ambassador", - "amber", - "ambience", - "ambiguity", - "ambition", - "ambulance", - "amendment", - "amenity", - "ammunition", - "amnesty", - "amount", - "amusement", - "anagram", - "analgesia", - "analog", - "analogue", - "analogy", - "analysis", - "analyst", - "analytics", - "anarchist", - "anarchy", - "anatomy", - "ancestor", - "anchovy", - "android", - "anesthesiologist", - "anesthesiology", - "angel", - "anger", - "angina", - "angiosperm", - "angle", - "angora", - "angstrom", - "anguish", - "animal", - "anime", - "anise", - "ankle", - "anklet", - "anniversary", - "announcement", - "annual", - "anorak", - "answer", - "ant", - "anteater", - "antecedent", - "antechamber", - "antelope", - "antennae", - "anterior", - "anthropology", - "antibody", - "anticipation", - "anticodon", - "antigen", - "antique", - "antiquity", - "antler", - "antling", - "anxiety", - "anybody", - "anyone", - "anything", - "anywhere", - "apartment", - "ape", - "aperitif", - "apology", - "app", - "apparatus", - "apparel", - "appeal", - "appearance", - "appellation", - "appendix", - "appetiser", - "appetite", - "appetizer", - "applause", - "apple", - "applewood", - "appliance", - "application", - "appointment", - "appreciation", - "apprehension", - "approach", - "appropriation", - "approval", - "apricot", - "apron", - "apse", - "aquarium", - "aquifer", - "arcade", - "arch", - "arch-rival", - "archaeologist", - "archaeology", - "archeology", - "archer", - "architect", - "architecture", - "archives", - "area", - "arena", - "argument", - "arithmetic", - "ark", - "arm", - "arm-rest", - "armadillo", - "armament", - "armchair", - "armoire", - "armor", - "armour", - "armpit", - "armrest", - "army", - "arrangement", - "array", - "arrest", - "arrival", - "arrogance", - "arrow", - "art", - "artery", - "arthur", - "artichoke", - 
"article", - "artifact", - "artificer", - "artist", - "ascend", - "ascent", - "ascot", - "ash", - "ashram", - "ashtray", - "aside", - "asparagus", - "aspect", - "asphalt", - "aspic", - "assassination", - "assault", - "assembly", - "assertion", - "assessment", - "asset", - "assignment", - "assist", - "assistance", - "assistant", - "associate", - "association", - "assumption", - "assurance", - "asterisk", - "astrakhan", - "astrolabe", - "astrologer", - "astrology", - "astronomy", - "asymmetry", - "atelier", - "atheist", - "athlete", - "athletics", - "atmosphere", - "atom", - "atrium", - "attachment", - "attack", - "attacker", - "attainment", - "attempt", - "attendance", - "attendant", - "attention", - "attenuation", - "attic", - "attitude", - "attorney", - "attraction", - "attribute", - "auction", - "audience", - "audit", - "auditorium", - "aunt", - "authentication", - "authenticity", - "author", - "authorisation", - "authority", - "authorization", - "auto", - "autoimmunity", - "automation", - "automaton", - "autumn", - "availability", - "avalanche", - "avenue", - "average", - "avocado", - "award", - "awareness", - "awe", - "axis", - "azimuth", - "babe", - "baboon", - "babushka", - "baby", - "bachelor", - "back", - "back-up", - "backbone", - "backburn", - "backdrop", - "background", - "backpack", - "backup", - "backyard", - "bacon", - "bacterium", - "badge", - "badger", - "bafflement", - "bag", - "bagel", - "baggage", - "baggie", - "baggy", - "bagpipe", - "bail", - "bait", - "bake", - "baker", - "bakery", - "bakeware", - "balaclava", - "balalaika", - "balance", - "balcony", - "ball", - "ballet", - "balloon", - "balloonist", - "ballot", - "ballpark", - "bamboo", - "ban", - "banana", - "band", - "bandana", - "bandanna", - "bandolier", - "bandwidth", - "bangle", - "banjo", - "bank", - "bankbook", - "banker", - "banking", - "bankruptcy", - "banner", - "banquette", - "banyan", - "baobab", - "bar", - "barbecue", - "barbeque", - "barber", - "barbiturate", - "bargain", - "barge", - "baritone", - "barium", - "bark", - "barley", - "barn", - "barometer", - "barracks", - "barrage", - "barrel", - "barrier", - "barstool", - "bartender", - "base", - "baseball", - "baseboard", - "baseline", - "basement", - "basics", - "basil", - "basin", - "basis", - "basket", - "basketball", - "bass", - "bassinet", - "bassoon", - "bat", - "bath", - "bather", - "bathhouse", - "bathrobe", - "bathroom", - "bathtub", - "battalion", - "batter", - "battery", - "batting", - "battle", - "battleship", - "bay", - "bayou", - "beach", - "bead", - "beak", - "beam", - "bean", - "beancurd", - "beanie", - "beanstalk", - "bear", - "beard", - "beast", - "beastie", - "beat", - "beating", - "beauty", - "beaver", - "beck", - "bed", - "bedrock", - "bedroom", - "bee", - "beech", - "beef", - "beer", - "beet", - "beetle", - "beggar", - "beginner", - "beginning", - "begonia", - "behalf", - "behavior", - "behaviour", - "beheading", - "behest", - "behold", - "being", - "belfry", - "belief", - "believer", - "bell", - "belligerency", - "bellows", - "belly", - "belt", - "bench", - "bend", - "beneficiary", - "benefit", - "beret", - "berry", - "best-seller", - "bestseller", - "bet", - "beverage", - "beyond", - "bias", - "bibliography", - "bicycle", - "bid", - "bidder", - "bidding", - "bidet", - "bifocals", - "bijou", - "bike", - "bikini", - "bill", - "billboard", - "billing", - "billion", - "bin", - "binoculars", - "biology", - "biopsy", - "biosphere", - "biplane", - "birch", - "bird", - "bird-watcher", - "birdbath", - "birdcage", - "birdhouse", - 
"birth", - "birthday", - "biscuit", - "bit", - "bite", - "bitten", - "bitter", - "black", - "blackberry", - "blackbird", - "blackboard", - "blackfish", - "blackness", - "bladder", - "blade", - "blame", - "blank", - "blanket", - "blast", - "blazer", - "blend", - "blessing", - "blight", - "blind", - "blinker", - "blister", - "blizzard", - "block", - "blocker", - "blog", - "blogger", - "blood", - "bloodflow", - "bloom", - "bloomer", - "blossom", - "blouse", - "blow", - "blowgun", - "blowhole", - "blue", - "blueberry", - "blush", - "boar", - "board", - "boat", - "boatload", - "boatyard", - "bob", - "bobcat", - "body", - "bog", - "bolero", - "bolt", - "bomb", - "bomber", - "bombing", - "bond", - "bonding", - "bondsman", - "bone", - "bonfire", - "bongo", - "bonnet", - "bonsai", - "bonus", - "boogeyman", - "book", - "bookcase", - "bookend", - "booking", - "booklet", - "bookmark", - "boolean", - "boom", - "boon", - "boost", - "booster", - "boot", - "bootee", - "bootie", - "booty", - "border", - "bore", - "borrower", - "borrowing", - "bosom", - "boss", - "botany", - "bother", - "bottle", - "bottling", - "bottom", - "bottom-line", - "boudoir", - "bough", - "boulder", - "boulevard", - "boundary", - "bouquet", - "bourgeoisie", - "bout", - "boutique", - "bow", - "bower", - "bowl", - "bowler", - "bowling", - "bowtie", - "box", - "boxer", - "boxspring", - "boy", - "boycott", - "boyfriend", - "boyhood", - "boysenberry", - "bra", - "brace", - "bracelet", - "bracket", - "brain", - "brake", - "bran", - "branch", - "brand", - "brandy", - "brass", - "brassiere", - "bratwurst", - "bread", - "breadcrumb", - "breadfruit", - "break", - "breakdown", - "breakfast", - "breakpoint", - "breakthrough", - "breast", - "breastplate", - "breath", - "breeze", - "brewer", - "bribery", - "brick", - "bricklaying", - "bride", - "bridge", - "brief", - "briefing", - "briefly", - "briefs", - "brilliant", - "brink", - "brisket", - "broad", - "broadcast", - "broccoli", - "brochure", - "brocolli", - "broiler", - "broker", - "bronchitis", - "bronco", - "bronze", - "brooch", - "brood", - "brook", - "broom", - "brother", - "brother-in-law", - "brow", - "brown", - "brownie", - "browser", - "browsing", - "brunch", - "brush", - "brushfire", - "brushing", - "bubble", - "buck", - "bucket", - "buckle", - "buckwheat", - "bud", - "buddy", - "budget", - "buffalo", - "buffer", - "buffet", - "bug", - "buggy", - "bugle", - "builder", - "building", - "bulb", - "bulk", - "bull", - "bull-fighter", - "bulldozer", - "bullet", - "bump", - "bumper", - "bun", - "bunch", - "bungalow", - "bunghole", - "bunkhouse", - "burden", - "bureau", - "burglar", - "burial", - "burlesque", - "burn", - "burn-out", - "burning", - "burrito", - "burro", - "burrow", - "burst", - "bus", - "bush", - "business", - "businessman", - "bust", - "bustle", - "butane", - "butcher", - "butler", - "butter", - "butterfly", - "button", - "buy", - "buyer", - "buying", - "buzz", - "buzzard", - "c-clamp", - "cabana", - "cabbage", - "cabin", - "cabinet", - "cable", - "caboose", - "cacao", - "cactus", - "caddy", - "cadet", - "cafe", - "caffeine", - "caftan", - "cage", - "cake", - "calcification", - "calculation", - "calculator", - "calculus", - "calendar", - "calf", - "caliber", - "calibre", - "calico", - "call", - "calm", - "calorie", - "camel", - "cameo", - "camera", - "camp", - "campaign", - "campaigning", - "campanile", - "camper", - "campus", - "can", - "canal", - "cancer", - "candelabra", - "candidacy", - "candidate", - "candle", - "candy", - "cane", - "cannibal", - "cannon", - "canoe", - 
"canon", - "canopy", - "cantaloupe", - "canteen", - "canvas", - "cap", - "capability", - "capacity", - "cape", - "caper", - "capital", - "capitalism", - "capitulation", - "capon", - "cappelletti", - "cappuccino", - "captain", - "caption", - "captor", - "car", - "carabao", - "caramel", - "caravan", - "carbohydrate", - "carbon", - "carboxyl", - "card", - "cardboard", - "cardigan", - "care", - "career", - "cargo", - "caribou", - "carload", - "carnation", - "carnival", - "carol", - "carotene", - "carp", - "carpenter", - "carpet", - "carpeting", - "carport", - "carriage", - "carrier", - "carrot", - "carry", - "cart", - "cartel", - "carter", - "cartilage", - "cartload", - "cartoon", - "cartridge", - "carving", - "cascade", - "case", - "casement", - "cash", - "cashew", - "cashier", - "casino", - "casket", - "cassava", - "casserole", - "cassock", - "cast", - "castanet", - "castle", - "casualty", - "cat", - "catacomb", - "catalogue", - "catalysis", - "catalyst", - "catamaran", - "catastrophe", - "catch", - "catcher", - "category", - "caterpillar", - "cathedral", - "cation", - "catsup", - "cattle", - "cauliflower", - "causal", - "cause", - "causeway", - "caution", - "cave", - "caviar", - "cayenne", - "ceiling", - "celebration", - "celebrity", - "celeriac", - "celery", - "cell", - "cellar", - "cello", - "celsius", - "cement", - "cemetery", - "cenotaph", - "census", - "cent", - "center", - "centimeter", - "centre", - "centurion", - "century", - "cephalopod", - "ceramic", - "ceramics", - "cereal", - "ceremony", - "certainty", - "certificate", - "certification", - "cesspool", - "chafe", - "chain", - "chainstay", - "chair", - "chairlift", - "chairman", - "chairperson", - "chaise", - "chalet", - "chalice", - "chalk", - "challenge", - "chamber", - "champagne", - "champion", - "championship", - "chance", - "chandelier", - "change", - "channel", - "chaos", - "chap", - "chapel", - "chaplain", - "chapter", - "character", - "characteristic", - "characterization", - "chard", - "charge", - "charger", - "charity", - "charlatan", - "charm", - "charset", - "chart", - "charter", - "chasm", - "chassis", - "chastity", - "chasuble", - "chateau", - "chatter", - "chauffeur", - "chauvinist", - "check", - "checkbook", - "checking", - "checkout", - "checkroom", - "cheddar", - "cheek", - "cheer", - "cheese", - "cheesecake", - "cheetah", - "chef", - "chem", - "chemical", - "chemistry", - "chemotaxis", - "cheque", - "cherry", - "chess", - "chest", - "chestnut", - "chick", - "chicken", - "chicory", - "chief", - "chiffonier", - "child", - "childbirth", - "childhood", - "chili", - "chill", - "chime", - "chimpanzee", - "chin", - "chinchilla", - "chino", - "chip", - "chipmunk", - "chit-chat", - "chivalry", - "chive", - "chives", - "chocolate", - "choice", - "choir", - "choker", - "cholesterol", - "choosing", - "chop", - "chops", - "chopstick", - "chopsticks", - "chord", - "chorus", - "chow", - "chowder", - "chrome", - "chromolithograph", - "chronicle", - "chronograph", - "chronometer", - "chrysalis", - "chub", - "chuck", - "chug", - "church", - "churn", - "chutney", - "cicada", - "cigarette", - "cilantro", - "cinder", - "cinema", - "cinnamon", - "circadian", - "circle", - "circuit", - "circulation", - "circumference", - "circumstance", - "cirrhosis", - "cirrus", - "citizen", - "citizenship", - "citron", - "citrus", - "city", - "civilian", - "civilisation", - "civilization", - "claim", - "clam", - "clamp", - "clan", - "clank", - "clapboard", - "clarification", - "clarinet", - "clarity", - "clasp", - "class", - "classic", - 
"classification", - "classmate", - "classroom", - "clause", - "clave", - "clavicle", - "clavier", - "claw", - "clay", - "cleaner", - "clearance", - "clearing", - "cleat", - "cleavage", - "clef", - "cleft", - "clergyman", - "cleric", - "clerk", - "click", - "client", - "cliff", - "climate", - "climb", - "clinic", - "clip", - "clipboard", - "clipper", - "cloak", - "cloakroom", - "clock", - "clockwork", - "clogs", - "cloister", - "clone", - "close", - "closet", - "closing", - "closure", - "cloth", - "clothes", - "clothing", - "cloud", - "cloudburst", - "clove", - "clover", - "cloves", - "club", - "clue", - "cluster", - "clutch", - "co-producer", - "coach", - "coal", - "coalition", - "coast", - "coaster", - "coat", - "cob", - "cobbler", - "cobweb", - "cockpit", - "cockroach", - "cocktail", - "cocoa", - "coconut", - "cod", - "code", - "codepage", - "codling", - "codon", - "codpiece", - "coevolution", - "cofactor", - "coffee", - "coffin", - "cohesion", - "cohort", - "coil", - "coin", - "coincidence", - "coinsurance", - "coke", - "cold", - "coleslaw", - "coliseum", - "collaboration", - "collagen", - "collapse", - "collar", - "collard", - "collateral", - "colleague", - "collection", - "collectivisation", - "collectivization", - "collector", - "college", - "collision", - "colloquy", - "colon", - "colonial", - "colonialism", - "colonisation", - "colonization", - "colony", - "color", - "colorlessness", - "colt", - "column", - "columnist", - "comb", - "combat", - "combination", - "combine", - "comeback", - "comedy", - "comestible", - "comfort", - "comfortable", - "comic", - "comics", - "comma", - "command", - "commander", - "commandment", - "comment", - "commerce", - "commercial", - "commission", - "commitment", - "committee", - "commodity", - "common", - "commonsense", - "commotion", - "communicant", - "communication", - "communion", - "communist", - "community", - "commuter", - "company", - "comparison", - "compass", - "compassion", - "compassionate", - "compensation", - "competence", - "competition", - "competitor", - "complaint", - "complement", - "completion", - "complex", - "complexity", - "compliance", - "complication", - "complicity", - "compliment", - "component", - "comportment", - "composer", - "composite", - "composition", - "compost", - "comprehension", - "compress", - "compromise", - "comptroller", - "compulsion", - "computer", - "comradeship", - "con", - "concentrate", - "concentration", - "concept", - "conception", - "concern", - "concert", - "conclusion", - "concrete", - "condition", - "conditioner", - "condominium", - "condor", - "conduct", - "conductor", - "cone", - "confectionery", - "conference", - "confidence", - "confidentiality", - "configuration", - "confirmation", - "conflict", - "conformation", - "confusion", - "conga", - "congo", - "congregation", - "congress", - "congressman", - "congressperson", - "conifer", - "connection", - "connotation", - "conscience", - "consciousness", - "consensus", - "consent", - "consequence", - "conservation", - "conservative", - "consideration", - "consignment", - "consist", - "consistency", - "console", - "consonant", - "conspiracy", - "conspirator", - "constant", - "constellation", - "constitution", - "constraint", - "construction", - "consul", - "consulate", - "consulting", - "consumer", - "consumption", - "contact", - "contact lens", - "contagion", - "container", - "content", - "contention", - "contest", - "context", - "continent", - "contingency", - "continuity", - "contour", - "contract", - "contractor", - "contrail", - "contrary", - 
"contrast", - "contribution", - "contributor", - "control", - "controller", - "controversy", - "convection", - "convenience", - "convention", - "conversation", - "conversion", - "convert", - "convertible", - "conviction", - "cook", - "cookbook", - "cookie", - "cooking", - "coonskin", - "cooperation", - "coordination", - "coordinator", - "cop", - "cop-out", - "cope", - "copper", - "copy", - "copying", - "copyright", - "copywriter", - "coral", - "cord", - "corduroy", - "core", - "cork", - "cormorant", - "corn", - "corner", - "cornerstone", - "cornet", - "cornflakes", - "cornmeal", - "corporal", - "corporation", - "corporatism", - "corps", - "corral", - "correspondence", - "correspondent", - "corridor", - "corruption", - "corsage", - "cosset", - "cost", - "costume", - "cot", - "cottage", - "cotton", - "couch", - "cougar", - "cough", - "council", - "councilman", - "councilor", - "councilperson", - "counsel", - "counseling", - "counselling", - "counsellor", - "counselor", - "count", - "counter", - "counter-force", - "counterpart", - "counterterrorism", - "countess", - "country", - "countryside", - "county", - "couple", - "coupon", - "courage", - "course", - "court", - "courthouse", - "courtroom", - "cousin", - "covariate", - "cover", - "coverage", - "coverall", - "cow", - "cowbell", - "cowboy", - "coyote", - "crab", - "crack", - "cracker", - "crackers", - "cradle", - "craft", - "craftsman", - "cranberry", - "crane", - "cranky", - "crash", - "crate", - "cravat", - "craw", - "crawdad", - "crayfish", - "crayon", - "crazy", - "cream", - "creation", - "creationism", - "creationist", - "creative", - "creativity", - "creator", - "creature", - "creche", - "credential", - "credenza", - "credibility", - "credit", - "creditor", - "creek", - "creme brulee", - "crepe", - "crest", - "crew", - "crewman", - "crewmate", - "crewmember", - "crewmen", - "cria", - "crib", - "cribbage", - "cricket", - "cricketer", - "crime", - "criminal", - "crinoline", - "crisis", - "crisp", - "criteria", - "criterion", - "critic", - "criticism", - "crocodile", - "crocus", - "croissant", - "crook", - "crop", - "cross", - "cross-contamination", - "cross-stitch", - "crotch", - "croup", - "crow", - "crowd", - "crown", - "crucifixion", - "crude", - "cruelty", - "cruise", - "crumb", - "crunch", - "crusader", - "crush", - "crust", - "cry", - "crystal", - "crystallography", - "cub", - "cube", - "cuckoo", - "cucumber", - "cue", - "cuff-link", - "cuisine", - "cultivar", - "cultivator", - "culture", - "culvert", - "cummerbund", - "cup", - "cupboard", - "cupcake", - "cupola", - "curd", - "cure", - "curio", - "curiosity", - "curl", - "curler", - "currant", - "currency", - "current", - "curriculum", - "curry", - "curse", - "cursor", - "curtailment", - "curtain", - "curve", - "cushion", - "custard", - "custody", - "custom", - "customer", - "cut", - "cuticle", - "cutlet", - "cutover", - "cutting", - "cyclamen", - "cycle", - "cyclone", - "cyclooxygenase", - "cygnet", - "cylinder", - "cymbal", - "cynic", - "cyst", - "cytokine", - "cytoplasm", - "dad", - "daddy", - "daffodil", - "dagger", - "dahlia", - "daikon", - "daily", - "dairy", - "daisy", - "dam", - "damage", - "dame", - "dance", - "dancer", - "dancing", - "dandelion", - "danger", - "dare", - "dark", - "darkness", - "darn", - "dart", - "dash", - "dashboard", - "data", - "database", - "date", - "daughter", - "dawn", - "day", - "daybed", - "daylight", - "dead", - "deadline", - "deal", - "dealer", - "dealing", - "dearest", - "death", - "deathwatch", - "debate", - "debris", - "debt", - "debtor", - 
"decade", - "decadence", - "decency", - "decimal", - "decision", - "decision-making", - "deck", - "declaration", - "declination", - "decline", - "decoder", - "decongestant", - "decoration", - "decrease", - "decryption", - "dedication", - "deduce", - "deduction", - "deed", - "deep", - "deer", - "default", - "defeat", - "defendant", - "defender", - "defense", - "deficit", - "definition", - "deformation", - "degradation", - "degree", - "delay", - "deliberation", - "delight", - "delivery", - "demand", - "democracy", - "democrat", - "demon", - "demur", - "den", - "denim", - "denominator", - "density", - "dentist", - "deodorant", - "department", - "departure", - "dependency", - "dependent", - "deployment", - "deposit", - "deposition", - "depot", - "depression", - "depressive", - "depth", - "deputy", - "derby", - "derivation", - "derivative", - "derrick", - "descendant", - "descent", - "description", - "desert", - "design", - "designation", - "designer", - "desire", - "desk", - "desktop", - "dessert", - "destination", - "destiny", - "destroyer", - "destruction", - "detail", - "detainee", - "detainment", - "detection", - "detective", - "detector", - "detention", - "determination", - "detour", - "devastation", - "developer", - "developing", - "development", - "developmental", - "deviance", - "deviation", - "device", - "devil", - "dew", - "dhow", - "diabetes", - "diadem", - "diagnosis", - "diagram", - "dial", - "dialect", - "dialogue", - "diam", - "diamond", - "diaper", - "diaphragm", - "diarist", - "diary", - "dibble", - "dickey", - "dictaphone", - "dictator", - "diction", - "dictionary", - "die", - "diesel", - "diet", - "difference", - "differential", - "difficulty", - "diffuse", - "dig", - "digestion", - "digestive", - "digger", - "digging", - "digit", - "dignity", - "dilapidation", - "dill", - "dilution", - "dime", - "dimension", - "dimple", - "diner", - "dinghy", - "dining", - "dinner", - "dinosaur", - "dioxide", - "dip", - "diploma", - "diplomacy", - "dipstick", - "direction", - "directive", - "director", - "directory", - "dirndl", - "dirt", - "disability", - "disadvantage", - "disagreement", - "disappointment", - "disarmament", - "disaster", - "discharge", - "discipline", - "disclaimer", - "disclosure", - "disco", - "disconnection", - "discount", - "discourse", - "discovery", - "discrepancy", - "discretion", - "discrimination", - "discussion", - "disdain", - "disease", - "disembodiment", - "disengagement", - "disguise", - "disgust", - "dish", - "dishwasher", - "disk", - "disparity", - "dispatch", - "displacement", - "display", - "disposal", - "disposer", - "disposition", - "dispute", - "disregard", - "disruption", - "dissemination", - "dissonance", - "distance", - "distinction", - "distortion", - "distribution", - "distributor", - "district", - "divalent", - "divan", - "diver", - "diversity", - "divide", - "dividend", - "divider", - "divine", - "diving", - "division", - "divorce", - "doc", - "dock", - "doctor", - "doctorate", - "doctrine", - "document", - "documentary", - "documentation", - "doe", - "dog", - "doggie", - "dogsled", - "dogwood", - "doing", - "doll", - "dollar", - "dollop", - "dolman", - "dolor", - "dolphin", - "domain", - "dome", - "domination", - "donation", - "donkey", - "donor", - "donut", - "door", - "doorbell", - "doorknob", - "doorpost", - "doorway", - "dory", - "dose", - "dot", - "double", - "doubling", - "doubt", - "doubter", - "dough", - "doughnut", - "down", - "downfall", - "downforce", - "downgrade", - "download", - "downstairs", - "downtown", - "downturn", - 
"dozen", - "draft", - "drag", - "dragon", - "dragonfly", - "dragonfruit", - "dragster", - "drain", - "drainage", - "drake", - "drama", - "dramaturge", - "drapes", - "draw", - "drawbridge", - "drawer", - "drawing", - "dream", - "dreamer", - "dredger", - "dress", - "dresser", - "dressing", - "drill", - "drink", - "drinking", - "drive", - "driver", - "driveway", - "driving", - "drizzle", - "dromedary", - "drop", - "drudgery", - "drug", - "drum", - "drummer", - "drunk", - "dryer", - "duck", - "duckling", - "dud", - "dude", - "due", - "duel", - "dueling", - "duffel", - "dugout", - "dulcimer", - "dumbwaiter", - "dump", - "dump truck", - "dune", - "dune buggy", - "dungarees", - "dungeon", - "duplexer", - "duration", - "durian", - "dusk", - "dust", - "dust storm", - "duster", - "duty", - "dwarf", - "dwell", - "dwelling", - "dynamics", - "dynamite", - "dynamo", - "dynasty", - "dysfunction", - "e-book", - "e-mail", - "e-reader", - "eagle", - "eaglet", - "ear", - "eardrum", - "earmuffs", - "earnings", - "earplug", - "earring", - "earrings", - "earth", - "earthquake", - "earthworm", - "ease", - "easel", - "east", - "eating", - "eaves", - "eavesdropper", - "ecclesia", - "echidna", - "eclipse", - "ecliptic", - "ecology", - "economics", - "economy", - "ecosystem", - "ectoderm", - "ectodermal", - "ecumenist", - "eddy", - "edge", - "edger", - "edible", - "editing", - "edition", - "editor", - "editorial", - "education", - "eel", - "effacement", - "effect", - "effective", - "effectiveness", - "effector", - "efficacy", - "efficiency", - "effort", - "egg", - "egghead", - "eggnog", - "eggplant", - "ego", - "eicosanoid", - "ejector", - "elbow", - "elderberry", - "election", - "electricity", - "electrocardiogram", - "electronics", - "element", - "elephant", - "elevation", - "elevator", - "eleventh", - "elf", - "elicit", - "eligibility", - "elimination", - "elite", - "elixir", - "elk", - "ellipse", - "elm", - "elongation", - "elver", - "email", - "emanate", - "embarrassment", - "embassy", - "embellishment", - "embossing", - "embryo", - "emerald", - "emergence", - "emergency", - "emergent", - "emery", - "emission", - "emitter", - "emotion", - "emphasis", - "empire", - "employ", - "employee", - "employer", - "employment", - "empowerment", - "emu", - "enactment", - "encirclement", - "enclave", - "enclosure", - "encounter", - "encouragement", - "encyclopedia", - "end", - "endive", - "endoderm", - "endorsement", - "endothelium", - "endpoint", - "enemy", - "energy", - "enforcement", - "engagement", - "engine", - "engineer", - "engineering", - "enigma", - "enjoyment", - "enquiry", - "enrollment", - "enterprise", - "entertainment", - "enthusiasm", - "entirety", - "entity", - "entrance", - "entree", - "entrepreneur", - "entry", - "envelope", - "environment", - "envy", - "enzyme", - "epauliere", - "epee", - "ephemera", - "ephemeris", - "ephyra", - "epic", - "episode", - "epithelium", - "epoch", - "eponym", - "epoxy", - "equal", - "equality", - "equation", - "equinox", - "equipment", - "equity", - "equivalent", - "era", - "eraser", - "erection", - "erosion", - "error", - "escalator", - "escape", - "escort", - "espadrille", - "espalier", - "essay", - "essence", - "essential", - "establishment", - "estate", - "estimate", - "estrogen", - "estuary", - "eternity", - "ethernet", - "ethics", - "ethnicity", - "ethyl", - "euphonium", - "eurocentrism", - "evaluation", - "evaluator", - "evaporation", - "eve", - "evening", - "evening-wear", - "event", - "everybody", - "everyone", - "everything", - "eviction", - "evidence", - "evil", - 
"evocation", - "evolution", - "ex-husband", - "ex-wife", - "exaggeration", - "exam", - "examination", - "examiner", - "example", - "exasperation", - "excellence", - "exception", - "excerpt", - "excess", - "exchange", - "excitement", - "exclamation", - "excursion", - "excuse", - "execution", - "executive", - "executor", - "exercise", - "exhaust", - "exhaustion", - "exhibit", - "exhibition", - "exile", - "existence", - "exit", - "exocrine", - "expansion", - "expansionism", - "expectancy", - "expectation", - "expedition", - "expense", - "experience", - "experiment", - "experimentation", - "expert", - "expertise", - "explanation", - "exploration", - "explorer", - "explosion", - "export", - "expose", - "exposition", - "exposure", - "expression", - "extension", - "extent", - "exterior", - "external", - "extinction", - "extreme", - "extremist", - "eye", - "eyeball", - "eyebrow", - "eyebrows", - "eyeglasses", - "eyelash", - "eyelashes", - "eyelid", - "eyelids", - "eyeliner", - "eyestrain", - "eyrie", - "fabric", - "face", - "facelift", - "facet", - "facility", - "facsimile", - "fact", - "factor", - "factory", - "faculty", - "fahrenheit", - "fail", - "failure", - "fairness", - "fairy", - "faith", - "faithful", - "fall", - "fallacy", - "falling-out", - "fame", - "familiar", - "familiarity", - "family", - "fan", - "fang", - "fanlight", - "fanny-pack", - "fantasy", - "farm", - "farmer", - "farming", - "farmland", - "farrow", - "fascia", - "fashion", - "fat", - "fate", - "father", - "father-in-law", - "fatigue", - "fatigues", - "faucet", - "fault", - "fav", - "fava", - "favor", - "favorite", - "fawn", - "fax", - "fear", - "feast", - "feather", - "feature", - "fedelini", - "federation", - "fedora", - "fee", - "feed", - "feedback", - "feeding", - "feel", - "feeling", - "fellow", - "felony", - "female", - "fen", - "fence", - "fencing", - "fender", - "feng", - "fennel", - "ferret", - "ferry", - "ferryboat", - "fertilizer", - "festival", - "fetus", - "few", - "fiber", - "fiberglass", - "fibre", - "fibroblast", - "fibrosis", - "ficlet", - "fiction", - "fiddle", - "field", - "fiery", - "fiesta", - "fifth", - "fig", - "fight", - "fighter", - "figure", - "figurine", - "file", - "filing", - "fill", - "fillet", - "filly", - "film", - "filter", - "filth", - "final", - "finance", - "financing", - "finding", - "fine", - "finer", - "finger", - "fingerling", - "fingernail", - "finish", - "finisher", - "fir", - "fire", - "fireman", - "fireplace", - "firewall", - "firm", - "first", - "fish", - "fishbone", - "fisherman", - "fishery", - "fishing", - "fishmonger", - "fishnet", - "fisting", - "fit", - "fitness", - "fix", - "fixture", - "flag", - "flair", - "flame", - "flan", - "flanker", - "flare", - "flash", - "flat", - "flatboat", - "flavor", - "flax", - "fleck", - "fledgling", - "fleece", - "flesh", - "flexibility", - "flick", - "flicker", - "flight", - "flint", - "flintlock", - "flip-flops", - "flock", - "flood", - "floodplain", - "floor", - "floozie", - "flour", - "flow", - "flower", - "flu", - "flugelhorn", - "fluke", - "flume", - "flung", - "flute", - "fly", - "flytrap", - "foal", - "foam", - "fob", - "focus", - "fog", - "fold", - "folder", - "folk", - "folklore", - "follower", - "following", - "fondue", - "font", - "food", - "foodstuffs", - "fool", - "foot", - "footage", - "football", - "footnote", - "footprint", - "footrest", - "footstep", - "footstool", - "footwear", - "forage", - "forager", - "foray", - "force", - "ford", - "forearm", - "forebear", - "forecast", - "forehead", - "foreigner", - "forelimb", - 
"forest", - "forestry", - "forever", - "forgery", - "fork", - "form", - "formal", - "formamide", - "format", - "formation", - "former", - "formicarium", - "formula", - "fort", - "forte", - "fortnight", - "fortress", - "fortune", - "forum", - "foundation", - "founder", - "founding", - "fountain", - "fourths", - "fowl", - "fox", - "foxglove", - "fraction", - "fragrance", - "frame", - "framework", - "fratricide", - "fraud", - "fraudster", - "freak", - "freckle", - "freedom", - "freelance", - "freezer", - "freezing", - "freight", - "freighter", - "frenzy", - "freon", - "frequency", - "fresco", - "friction", - "fridge", - "friend", - "friendship", - "fries", - "frigate", - "fright", - "fringe", - "fritter", - "frock", - "frog", - "front", - "frontier", - "frost", - "frosting", - "frown", - "fruit", - "frustration", - "fry", - "fuel", - "fugato", - "fulfillment", - "full", - "fun", - "function", - "functionality", - "fund", - "funding", - "fundraising", - "funeral", - "fur", - "furnace", - "furniture", - "furry", - "fusarium", - "futon", - "future", - "gadget", - "gaffe", - "gaffer", - "gain", - "gaiters", - "gale", - "gall-bladder", - "gallery", - "galley", - "gallon", - "galoshes", - "gambling", - "game", - "gamebird", - "gaming", - "gamma-ray", - "gander", - "gang", - "gap", - "garage", - "garb", - "garbage", - "garden", - "garlic", - "garment", - "garter", - "gas", - "gasket", - "gasoline", - "gasp", - "gastronomy", - "gastropod", - "gate", - "gateway", - "gather", - "gathering", - "gator", - "gauge", - "gauntlet", - "gavel", - "gazebo", - "gazelle", - "gear", - "gearshift", - "geek", - "gel", - "gelatin", - "gelding", - "gem", - "gemsbok", - "gender", - "gene", - "general", - "generation", - "generator", - "generosity", - "genetics", - "genie", - "genius", - "genocide", - "genre", - "gentleman", - "geography", - "geology", - "geometry", - "geranium", - "gerbil", - "gesture", - "geyser", - "gherkin", - "ghost", - "giant", - "gift", - "gig", - "gigantism", - "giggle", - "ginger", - "gingerbread", - "ginseng", - "giraffe", - "girdle", - "girl", - "girlfriend", - "git", - "glacier", - "gladiolus", - "glance", - "gland", - "glass", - "glasses", - "glee", - "glen", - "glider", - "gliding", - "glimpse", - "globe", - "glockenspiel", - "gloom", - "glory", - "glove", - "glow", - "glucose", - "glue", - "glut", - "glutamate", - "gnat", - "gnu", - "go-kart", - "goal", - "goat", - "gobbler", - "god", - "goddess", - "godfather", - "godmother", - "godparent", - "goggles", - "going", - "gold", - "goldfish", - "golf", - "gondola", - "gong", - "good", - "good-bye", - "goodbye", - "goodie", - "goodness", - "goodnight", - "goodwill", - "goose", - "gopher", - "gorilla", - "gosling", - "gossip", - "governance", - "government", - "governor", - "gown", - "grab-bag", - "grace", - "grade", - "gradient", - "graduate", - "graduation", - "graffiti", - "graft", - "grain", - "gram", - "grammar", - "gran", - "grand", - "grandchild", - "granddaughter", - "grandfather", - "grandma", - "grandmom", - "grandmother", - "grandpa", - "grandparent", - "grandson", - "granny", - "granola", - "grant", - "grape", - "grapefruit", - "graph", - "graphic", - "grasp", - "grass", - "grasshopper", - "grassland", - "gratitude", - "gravel", - "gravitas", - "gravity", - "gravy", - "gray", - "grease", - "great-grandfather", - "great-grandmother", - "greatness", - "greed", - "green", - "greenhouse", - "greens", - "grenade", - "grey", - "grid", - "grief", - "grill", - "grin", - "grip", - "gripper", - "grit", - "grocery", - "ground", - "group", - 
"grouper", - "grouse", - "grove", - "growth", - "grub", - "guacamole", - "guarantee", - "guard", - "guava", - "guerrilla", - "guess", - "guest", - "guestbook", - "guidance", - "guide", - "guideline", - "guilder", - "guilt", - "guilty", - "guinea", - "guitar", - "guitarist", - "gum", - "gumshoe", - "gun", - "gunpowder", - "gutter", - "guy", - "gym", - "gymnast", - "gymnastics", - "gynaecology", - "gyro", - "habit", - "habitat", - "hacienda", - "hacksaw", - "hackwork", - "hail", - "hair", - "haircut", - "hake", - "half", - "half-brother", - "half-sister", - "halibut", - "hall", - "halloween", - "hallway", - "halt", - "ham", - "hamburger", - "hammer", - "hammock", - "hamster", - "hand", - "hand-holding", - "handball", - "handful", - "handgun", - "handicap", - "handle", - "handlebar", - "handmaiden", - "handover", - "handrail", - "handsaw", - "hanger", - "happening", - "happiness", - "harald", - "harbor", - "harbour", - "hard-hat", - "hardboard", - "hardcover", - "hardening", - "hardhat", - "hardship", - "hardware", - "hare", - "harm", - "harmonica", - "harmonise", - "harmonize", - "harmony", - "harp", - "harpooner", - "harpsichord", - "harvest", - "harvester", - "hash", - "hashtag", - "hassock", - "haste", - "hat", - "hatbox", - "hatchet", - "hatchling", - "hate", - "hatred", - "haunt", - "haven", - "haversack", - "havoc", - "hawk", - "hay", - "haze", - "hazel", - "hazelnut", - "head", - "headache", - "headlight", - "headline", - "headphones", - "headquarters", - "headrest", - "health", - "health-care", - "hearing", - "hearsay", - "heart", - "heart-throb", - "heartache", - "heartbeat", - "hearth", - "hearthside", - "heartwood", - "heat", - "heater", - "heating", - "heaven", - "heavy", - "hectare", - "hedge", - "hedgehog", - "heel", - "heifer", - "height", - "heir", - "heirloom", - "helicopter", - "helium", - "hellcat", - "hello", - "helmet", - "helo", - "help", - "hemisphere", - "hemp", - "hen", - "hepatitis", - "herb", - "herbs", - "heritage", - "hermit", - "hero", - "heroine", - "heron", - "herring", - "hesitation", - "heterosexual", - "hexagon", - "heyday", - "hiccups", - "hide", - "hierarchy", - "high", - "high-rise", - "highland", - "highlight", - "highway", - "hike", - "hiking", - "hill", - "hint", - "hip", - "hippodrome", - "hippopotamus", - "hire", - "hiring", - "historian", - "history", - "hit", - "hive", - "hobbit", - "hobby", - "hockey", - "hoe", - "hog", - "hold", - "holder", - "hole", - "holiday", - "home", - "homeland", - "homeownership", - "hometown", - "homework", - "homicide", - "homogenate", - "homonym", - "homosexual", - "homosexuality", - "honesty", - "honey", - "honeybee", - "honeydew", - "honor", - "honoree", - "hood", - "hoof", - "hook", - "hop", - "hope", - "hops", - "horde", - "horizon", - "hormone", - "horn", - "hornet", - "horror", - "horse", - "horseradish", - "horst", - "hose", - "hosiery", - "hospice", - "hospital", - "hospitalisation", - "hospitality", - "hospitalization", - "host", - "hostel", - "hostess", - "hotdog", - "hotel", - "hound", - "hour", - "hourglass", - "house", - "houseboat", - "household", - "housewife", - "housework", - "housing", - "hovel", - "hovercraft", - "howard", - "howitzer", - "hub", - "hubcap", - "hubris", - "hug", - "hugger", - "hull", - "human", - "humanity", - "humidity", - "hummus", - "humor", - "humour", - "hunchback", - "hundred", - "hunger", - "hunt", - "hunter", - "hunting", - "hurdle", - "hurdler", - "hurricane", - "hurry", - "hurt", - "husband", - "hut", - "hutch", - "hyacinth", - "hybridisation", - "hybridization", - 
"hydrant", - "hydraulics", - "hydrocarb", - "hydrocarbon", - "hydrofoil", - "hydrogen", - "hydrolyse", - "hydrolysis", - "hydrolyze", - "hydroxyl", - "hyena", - "hygienic", - "hype", - "hyphenation", - "hypochondria", - "hypothermia", - "hypothesis", - "ice", - "ice-cream", - "iceberg", - "icebreaker", - "icecream", - "icicle", - "icing", - "icon", - "icy", - "id", - "idea", - "ideal", - "identification", - "identity", - "ideology", - "idiom", - "igloo", - "ignorance", - "ignorant", - "ikebana", - "illegal", - "illiteracy", - "illness", - "illusion", - "illustration", - "image", - "imagination", - "imbalance", - "imitation", - "immigrant", - "immigration", - "immortal", - "impact", - "impairment", - "impala", - "impediment", - "implement", - "implementation", - "implication", - "import", - "importance", - "impostor", - "impress", - "impression", - "imprisonment", - "impropriety", - "improvement", - "impudence", - "impulse", - "in-joke", - "in-laws", - "inability", - "inauguration", - "inbox", - "incandescence", - "incarnation", - "incense", - "incentive", - "inch", - "incidence", - "incident", - "incision", - "inclusion", - "income", - "incompetence", - "inconvenience", - "increase", - "incubation", - "independence", - "independent", - "index", - "indication", - "indicator", - "indigence", - "individual", - "industrialisation", - "industrialization", - "industry", - "inequality", - "inevitable", - "infancy", - "infant", - "infarction", - "infection", - "infiltration", - "infinite", - "infix", - "inflammation", - "inflation", - "influence", - "influx", - "info", - "information", - "infrastructure", - "infusion", - "inglenook", - "ingrate", - "ingredient", - "inhabitant", - "inheritance", - "inhibition", - "inhibitor", - "initial", - "initialise", - "initialize", - "initiative", - "injunction", - "injury", - "injustice", - "ink", - "inlay", - "inn", - "innervation", - "innocence", - "innocent", - "innovation", - "input", - "inquiry", - "inscription", - "insect", - "insectarium", - "insert", - "inside", - "insight", - "insolence", - "insomnia", - "inspection", - "inspector", - "inspiration", - "installation", - "instance", - "instant", - "instinct", - "institute", - "institution", - "instruction", - "instructor", - "instrument", - "instrumentalist", - "instrumentation", - "insulation", - "insurance", - "insurgence", - "insurrection", - "integer", - "integral", - "integration", - "integrity", - "intellect", - "intelligence", - "intensity", - "intent", - "intention", - "intentionality", - "interaction", - "interchange", - "interconnection", - "intercourse", - "interest", - "interface", - "interferometer", - "interior", - "interject", - "interloper", - "internet", - "interpretation", - "interpreter", - "interval", - "intervenor", - "intervention", - "interview", - "interviewer", - "intestine", - "introduction", - "intuition", - "invader", - "invasion", - "invention", - "inventor", - "inventory", - "inverse", - "inversion", - "investigation", - "investigator", - "investment", - "investor", - "invitation", - "invite", - "invoice", - "involvement", - "iridescence", - "iris", - "iron", - "ironclad", - "irony", - "irrigation", - "ischemia", - "island", - "isogloss", - "isolation", - "issue", - "item", - "itinerary", - "ivory", - "jack", - "jackal", - "jacket", - "jackfruit", - "jade", - "jaguar", - "jail", - "jailhouse", - "jalapeño", - "jam", - "jar", - "jasmine", - "jaw", - "jazz", - "jealousy", - "jeans", - "jeep", - "jelly", - "jellybeans", - "jellyfish", - "jerk", - "jet", - "jewel", - 
"jeweller", - "jewellery", - "jewelry", - "jicama", - "jiffy", - "job", - "jockey", - "jodhpurs", - "joey", - "jogging", - "joint", - "joke", - "jot", - "journal", - "journalism", - "journalist", - "journey", - "joy", - "judge", - "judgment", - "judo", - "jug", - "juggernaut", - "juice", - "julienne", - "jumbo", - "jump", - "jumper", - "jumpsuit", - "jungle", - "junior", - "junk", - "junker", - "junket", - "jury", - "justice", - "justification", - "jute", - "kale", - "kamikaze", - "kangaroo", - "karate", - "kayak", - "kazoo", - "kebab", - "keep", - "keeper", - "kendo", - "kennel", - "ketch", - "ketchup", - "kettle", - "kettledrum", - "key", - "keyboard", - "keyboarding", - "keystone", - "kick", - "kick-off", - "kid", - "kidney", - "kielbasa", - "kill", - "killer", - "killing", - "kilogram", - "kilometer", - "kilt", - "kimono", - "kinase", - "kind", - "kindness", - "king", - "kingdom", - "kingfish", - "kiosk", - "kiss", - "kit", - "kitchen", - "kite", - "kitsch", - "kitten", - "kitty", - "kiwi", - "knee", - "kneejerk", - "knickers", - "knife", - "knife-edge", - "knight", - "knitting", - "knock", - "knot", - "know-how", - "knowledge", - "knuckle", - "koala", - "kohlrabi", - "kumquat", - "lab", - "label", - "labor", - "laboratory", - "laborer", - "labour", - "labourer", - "lace", - "lack", - "lacquerware", - "lad", - "ladder", - "ladle", - "lady", - "ladybug", - "lag", - "lake", - "lamb", - "lambkin", - "lament", - "lamp", - "lanai", - "land", - "landform", - "landing", - "landmine", - "landscape", - "lane", - "language", - "lantern", - "lap", - "laparoscope", - "lapdog", - "laptop", - "larch", - "lard", - "larder", - "lark", - "larva", - "laryngitis", - "lasagna", - "lashes", - "last", - "latency", - "latex", - "lathe", - "latitude", - "latte", - "latter", - "laugh", - "laughter", - "laundry", - "lava", - "law", - "lawmaker", - "lawn", - "lawsuit", - "lawyer", - "lay", - "layer", - "layout", - "lead", - "leader", - "leadership", - "leading", - "leaf", - "league", - "leaker", - "leap", - "learning", - "leash", - "leather", - "leave", - "leaver", - "lecture", - "leek", - "leeway", - "left", - "leg", - "legacy", - "legal", - "legend", - "legging", - "legislation", - "legislator", - "legislature", - "legitimacy", - "legume", - "leisure", - "lemon", - "lemonade", - "lemur", - "lender", - "lending", - "length", - "lens", - "lentil", - "leopard", - "leprosy", - "leptocephalus", - "lesbian", - "lesson", - "letter", - "lettuce", - "level", - "lever", - "leverage", - "leveret", - "liability", - "liar", - "liberty", - "libido", - "library", - "licence", - "license", - "licensing", - "licorice", - "lid", - "lie", - "lieu", - "lieutenant", - "life", - "lifestyle", - "lifetime", - "lift", - "ligand", - "light", - "lighting", - "lightning", - "lightscreen", - "ligula", - "likelihood", - "likeness", - "lilac", - "lily", - "limb", - "lime", - "limestone", - "limit", - "limitation", - "limo", - "line", - "linen", - "liner", - "linguist", - "linguistics", - "lining", - "link", - "linkage", - "linseed", - "lion", - "lip", - "lipid", - "lipoprotein", - "lipstick", - "liquid", - "liquidity", - "liquor", - "list", - "listening", - "listing", - "literate", - "literature", - "litigation", - "litmus", - "litter", - "littleneck", - "liver", - "livestock", - "living", - "lizard", - "llama", - "load", - "loading", - "loaf", - "loafer", - "loan", - "lobby", - "lobotomy", - "lobster", - "local", - "locality", - "location", - "lock", - "locker", - "locket", - "locomotive", - "locust", - "lode", - "loft", - "log", - 
"loggia", - "logic", - "login", - "logistics", - "logo", - "loincloth", - "lollipop", - "loneliness", - "longboat", - "longitude", - "look", - "lookout", - "loop", - "loophole", - "loquat", - "lord", - "loss", - "lot", - "lotion", - "lottery", - "lounge", - "louse", - "lout", - "love", - "lover", - "lox", - "loyalty", - "luck", - "luggage", - "lumber", - "lumberman", - "lunch", - "luncheonette", - "lunchmeat", - "lunchroom", - "lung", - "lunge", - "lute", - "luxury", - "lychee", - "lycra", - "lye", - "lymphocyte", - "lynx", - "lyocell", - "lyre", - "lyrics", - "lysine", - "macadamia", - "macaroni", - "macaroon", - "macaw", - "machine", - "machinery", - "macrame", - "macro", - "macrofauna", - "madam", - "maelstrom", - "maestro", - "magazine", - "maggot", - "magic", - "magnet", - "magnitude", - "maid", - "maiden", - "mail", - "mailbox", - "mailer", - "mailing", - "mailman", - "main", - "mainland", - "mainstream", - "maintainer", - "maintenance", - "maize", - "major", - "major-league", - "majority", - "makeover", - "maker", - "makeup", - "making", - "male", - "malice", - "mall", - "mallard", - "mallet", - "malnutrition", - "mama", - "mambo", - "mammoth", - "man", - "manacle", - "management", - "manager", - "manatee", - "mandarin", - "mandate", - "mandolin", - "mangle", - "mango", - "mangrove", - "manhunt", - "maniac", - "manicure", - "manifestation", - "manipulation", - "mankind", - "manner", - "manor", - "mansard", - "manservant", - "mansion", - "mantel", - "mantle", - "mantua", - "manufacturer", - "manufacturing", - "many", - "map", - "maple", - "mapping", - "maracas", - "marathon", - "marble", - "march", - "mare", - "margarine", - "margin", - "mariachi", - "marimba", - "marines", - "marionberry", - "mark", - "marker", - "market", - "marketer", - "marketing", - "marketplace", - "marksman", - "markup", - "marmalade", - "marriage", - "marsh", - "marshland", - "marshmallow", - "marten", - "marxism", - "mascara", - "mask", - "masonry", - "mass", - "massage", - "mast", - "master", - "masterpiece", - "mastication", - "mastoid", - "mat", - "match", - "matchmaker", - "mate", - "material", - "maternity", - "math", - "mathematics", - "matrix", - "matter", - "mattock", - "mattress", - "max", - "maximum", - "maybe", - "mayonnaise", - "mayor", - "meadow", - "meal", - "mean", - "meander", - "meaning", - "means", - "meantime", - "measles", - "measure", - "measurement", - "meat", - "meatball", - "meatloaf", - "mecca", - "mechanic", - "mechanism", - "med", - "medal", - "media", - "median", - "medication", - "medicine", - "medium", - "meet", - "meeting", - "melatonin", - "melody", - "melon", - "member", - "membership", - "membrane", - "meme", - "memo", - "memorial", - "memory", - "men", - "menopause", - "menorah", - "mention", - "mentor", - "menu", - "merchandise", - "merchant", - "mercury", - "meridian", - "meringue", - "merit", - "mesenchyme", - "mess", - "message", - "messenger", - "messy", - "metabolite", - "metal", - "metallurgist", - "metaphor", - "meteor", - "meteorology", - "meter", - "methane", - "method", - "methodology", - "metric", - "metro", - "metronome", - "mezzanine", - "microlending", - "micronutrient", - "microphone", - "microwave", - "mid-course", - "midden", - "middle", - "middleman", - "midline", - "midnight", - "midwife", - "might", - "migrant", - "migration", - "mile", - "mileage", - "milepost", - "milestone", - "military", - "milk", - "milkshake", - "mill", - "millennium", - "millet", - "millimeter", - "million", - "millisecond", - "millstone", - "mime", - "mimosa", - "min", - 
"mincemeat", - "mind", - "mine", - "mineral", - "mineshaft", - "mini", - "mini-skirt", - "minibus", - "minimalism", - "minimum", - "mining", - "minion", - "minister", - "mink", - "minnow", - "minor", - "minor-league", - "minority", - "mint", - "minute", - "miracle", - "mirror", - "miscarriage", - "miscommunication", - "misfit", - "misnomer", - "misogyny", - "misplacement", - "misreading", - "misrepresentation", - "miss", - "missile", - "mission", - "missionary", - "mist", - "mistake", - "mister", - "misunderstand", - "miter", - "mitten", - "mix", - "mixer", - "mixture", - "moai", - "moat", - "mob", - "mobile", - "mobility", - "mobster", - "moccasins", - "mocha", - "mochi", - "mode", - "model", - "modeling", - "modem", - "modernist", - "modernity", - "modification", - "molar", - "molasses", - "molding", - "mole", - "molecule", - "mom", - "moment", - "monastery", - "monasticism", - "money", - "monger", - "monitor", - "monitoring", - "monk", - "monkey", - "monocle", - "monopoly", - "monotheism", - "monsoon", - "monster", - "month", - "monument", - "mood", - "moody", - "moon", - "moonlight", - "moonscape", - "moonshine", - "moose", - "mop", - "morale", - "morbid", - "morbidity", - "morning", - "moron", - "morphology", - "morsel", - "mortal", - "mortality", - "mortgage", - "mortise", - "mosque", - "mosquito", - "most", - "motel", - "moth", - "mother", - "mother-in-law", - "motion", - "motivation", - "motive", - "motor", - "motorboat", - "motorcar", - "motorcycle", - "mound", - "mountain", - "mouse", - "mouser", - "mousse", - "moustache", - "mouth", - "mouton", - "movement", - "mover", - "movie", - "mower", - "mozzarella", - "mud", - "muffin", - "mug", - "mukluk", - "mule", - "multimedia", - "murder", - "muscat", - "muscatel", - "muscle", - "musculature", - "museum", - "mushroom", - "music", - "music-box", - "music-making", - "musician", - "muskrat", - "mussel", - "mustache", - "mustard", - "mutation", - "mutt", - "mutton", - "mycoplasma", - "mystery", - "myth", - "mythology", - "nail", - "name", - "naming", - "nanoparticle", - "napkin", - "narrative", - "nasal", - "nation", - "nationality", - "native", - "naturalisation", - "nature", - "navigation", - "necessity", - "neck", - "necklace", - "necktie", - "nectar", - "nectarine", - "need", - "needle", - "neglect", - "negligee", - "negotiation", - "neighbor", - "neighborhood", - "neighbour", - "neighbourhood", - "neologism", - "neon", - "neonate", - "nephew", - "nerve", - "nest", - "nestling", - "nestmate", - "net", - "netball", - "netbook", - "netsuke", - "network", - "networking", - "neurobiologist", - "neuron", - "neuropathologist", - "neuropsychiatry", - "news", - "newsletter", - "newspaper", - "newsprint", - "newsstand", - "nexus", - "nibble", - "nicety", - "niche", - "nick", - "nickel", - "nickname", - "niece", - "night", - "nightclub", - "nightgown", - "nightingale", - "nightlife", - "nightlight", - "nightmare", - "ninja", - "nit", - "nitrogen", - "nobody", - "nod", - "node", - "noir", - "noise", - "nonbeliever", - "nonconformist", - "nondisclosure", - "nonsense", - "noodle", - "noodles", - "noon", - "norm", - "normal", - "normalisation", - "normalization", - "north", - "nose", - "notation", - "note", - "notebook", - "notepad", - "nothing", - "notice", - "notion", - "notoriety", - "nougat", - "noun", - "nourishment", - "novel", - "nucleotidase", - "nucleotide", - "nudge", - "nuke", - "number", - "numeracy", - "numeric", - "numismatist", - "nun", - "nurse", - "nursery", - "nursing", - "nurture", - "nut", - "nutmeg", - "nutrient", - 
"nutrition", - "nylon", - "nymph", - "oak", - "oar", - "oasis", - "oat", - "oatmeal", - "oats", - "obedience", - "obesity", - "obi", - "object", - "objection", - "objective", - "obligation", - "oboe", - "observation", - "observatory", - "obsession", - "obsidian", - "obstacle", - "occasion", - "occupation", - "occurrence", - "ocean", - "ocelot", - "octagon", - "octave", - "octavo", - "octet", - "octopus", - "odometer", - "odyssey", - "oeuvre", - "off-ramp", - "offence", - "offense", - "offer", - "offering", - "office", - "officer", - "official", - "offset", - "oil", - "okra", - "oldie", - "oleo", - "olive", - "omega", - "omelet", - "omission", - "omnivore", - "oncology", - "onion", - "online", - "onset", - "opening", - "opera", - "operating", - "operation", - "operator", - "ophthalmologist", - "opinion", - "opium", - "opossum", - "opponent", - "opportunist", - "opportunity", - "opposite", - "opposition", - "optimal", - "optimisation", - "optimist", - "optimization", - "option", - "orange", - "orangutan", - "orator", - "orchard", - "orchestra", - "orchid", - "order", - "ordinary", - "ordination", - "ore", - "oregano", - "organ", - "organisation", - "organising", - "organization", - "organizing", - "orient", - "orientation", - "origin", - "original", - "originality", - "ornament", - "osmosis", - "osprey", - "ostrich", - "other", - "otter", - "ottoman", - "ounce", - "outback", - "outcome", - "outfielder", - "outfit", - "outhouse", - "outlaw", - "outlay", - "outlet", - "outline", - "outlook", - "output", - "outrage", - "outrigger", - "outrun", - "outset", - "outside", - "oval", - "ovary", - "oven", - "overcharge", - "overclocking", - "overcoat", - "overexertion", - "overflight", - "overhead", - "overheard", - "overload", - "overnighter", - "overshoot", - "oversight", - "overview", - "overweight", - "owl", - "owner", - "ownership", - "ox", - "oxford", - "oxygen", - "oyster", - "ozone", - "pace", - "pacemaker", - "pack", - "package", - "packaging", - "packet", - "pad", - "paddle", - "paddock", - "pagan", - "page", - "pagoda", - "pail", - "pain", - "paint", - "painter", - "painting", - "paintwork", - "pair", - "pajamas", - "palace", - "palate", - "palm", - "pamphlet", - "pan", - "pancake", - "pancreas", - "panda", - "panel", - "panic", - "pannier", - "panpipe", - "pansy", - "panther", - "panties", - "pantologist", - "pantology", - "pantry", - "pants", - "pantsuit", - "panty", - "pantyhose", - "papa", - "papaya", - "paper", - "paperback", - "paperwork", - "parable", - "parachute", - "parade", - "paradise", - "paragraph", - "parallelogram", - "paramecium", - "paramedic", - "parameter", - "paranoia", - "parcel", - "parchment", - "pard", - "pardon", - "parent", - "parenthesis", - "parenting", - "park", - "parka", - "parking", - "parliament", - "parole", - "parrot", - "parser", - "parsley", - "parsnip", - "part", - "participant", - "participation", - "particle", - "particular", - "partner", - "partnership", - "partridge", - "party", - "pass", - "passage", - "passbook", - "passenger", - "passing", - "passion", - "passive", - "passport", - "password", - "past", - "pasta", - "paste", - "pastor", - "pastoralist", - "pastry", - "pasture", - "pat", - "patch", - "pate", - "patent", - "patentee", - "path", - "pathogenesis", - "pathology", - "pathway", - "patience", - "patient", - "patina", - "patio", - "patriarch", - "patrimony", - "patriot", - "patrol", - "patroller", - "patrolling", - "patron", - "pattern", - "patty", - "pattypan", - "pause", - "pavement", - "pavilion", - "paw", - "pawnshop", - "pay", - 
"payee", - "payment", - "payoff", - "pea", - "peace", - "peach", - "peacoat", - "peacock", - "peak", - "peanut", - "pear", - "pearl", - "peasant", - "pecan", - "pedal", - "peek", - "peen", - "peer", - "peer-to-peer", - "pegboard", - "pelican", - "pelt", - "pen", - "penalty", - "pence", - "pencil", - "pendant", - "pendulum", - "penguin", - "penicillin", - "peninsula", - "pennant", - "penny", - "pension", - "pentagon", - "peony", - "people", - "pepper", - "pepperoni", - "percent", - "percentage", - "perception", - "perch", - "perennial", - "perfection", - "performance", - "perfume", - "period", - "periodical", - "peripheral", - "permafrost", - "permission", - "permit", - "perp", - "perpendicular", - "persimmon", - "person", - "personal", - "personality", - "personnel", - "perspective", - "pest", - "pet", - "petal", - "petition", - "petitioner", - "petticoat", - "pew", - "pharmacist", - "pharmacopoeia", - "phase", - "pheasant", - "phenomenon", - "phenotype", - "pheromone", - "philanthropy", - "philosopher", - "philosophy", - "phone", - "phosphate", - "photo", - "photodiode", - "photograph", - "photographer", - "photography", - "photoreceptor", - "phrase", - "phrasing", - "physical", - "physics", - "physiology", - "pianist", - "piano", - "piccolo", - "pick", - "pickax", - "pickaxe", - "picket", - "pickle", - "pickup", - "picnic", - "picture", - "picturesque", - "pie", - "piece", - "pier", - "piety", - "pig", - "pigeon", - "piglet", - "pigpen", - "pigsty", - "pike", - "pilaf", - "pile", - "pilgrim", - "pilgrimage", - "pill", - "pillar", - "pillbox", - "pillow", - "pilot", - "pimp", - "pimple", - "pin", - "pinafore", - "pince-nez", - "pine", - "pineapple", - "pinecone", - "ping", - "pink", - "pinkie", - "pinot", - "pinstripe", - "pint", - "pinto", - "pinworm", - "pioneer", - "pipe", - "pipeline", - "piracy", - "pirate", - "pistol", - "pit", - "pita", - "pitch", - "pitcher", - "pitching", - "pith", - "pizza", - "place", - "placebo", - "placement", - "placode", - "plagiarism", - "plain", - "plaintiff", - "plan", - "plane", - "planet", - "planning", - "plant", - "plantation", - "planter", - "planula", - "plaster", - "plasterboard", - "plastic", - "plate", - "platelet", - "platform", - "platinum", - "platter", - "platypus", - "play", - "player", - "playground", - "playroom", - "playwright", - "plea", - "pleasure", - "pleat", - "pledge", - "plenty", - "plier", - "pliers", - "plight", - "plot", - "plough", - "plover", - "plow", - "plowman", - "plug", - "plugin", - "plum", - "plumber", - "plume", - "plunger", - "plywood", - "pneumonia", - "pocket", - "pocket-watch", - "pocketbook", - "pod", - "podcast", - "poem", - "poet", - "poetry", - "poignance", - "point", - "poison", - "poisoning", - "poker", - "polarisation", - "polarization", - "pole", - "polenta", - "police", - "policeman", - "policy", - "polish", - "politician", - "politics", - "poll", - "polliwog", - "pollutant", - "pollution", - "polo", - "polyester", - "polyp", - "pomegranate", - "pomelo", - "pompom", - "poncho", - "pond", - "pony", - "pool", - "poor", - "pop", - "popcorn", - "poppy", - "popsicle", - "popularity", - "population", - "populist", - "porcelain", - "porch", - "porcupine", - "pork", - "porpoise", - "port", - "porter", - "portfolio", - "porthole", - "portion", - "portrait", - "position", - "possession", - "possibility", - "possible", - "post", - "postage", - "postbox", - "poster", - "posterior", - "postfix", - "pot", - "potato", - "potential", - "pottery", - "potty", - "pouch", - "poultry", - "pound", - "pounding", - "poverty", - 
"powder", - "power", - "practice", - "practitioner", - "prairie", - "praise", - "pray", - "prayer", - "precedence", - "precedent", - "precipitation", - "precision", - "predecessor", - "preface", - "preference", - "prefix", - "pregnancy", - "prejudice", - "prelude", - "premeditation", - "premier", - "premise", - "premium", - "preoccupation", - "preparation", - "prescription", - "presence", - "present", - "presentation", - "preservation", - "preserves", - "presidency", - "president", - "press", - "pressroom", - "pressure", - "pressurisation", - "pressurization", - "prestige", - "presume", - "pretzel", - "prevalence", - "prevention", - "prey", - "price", - "pricing", - "pride", - "priest", - "priesthood", - "primary", - "primate", - "prince", - "princess", - "principal", - "principle", - "print", - "printer", - "printing", - "prior", - "priority", - "prison", - "prisoner", - "privacy", - "private", - "privilege", - "prize", - "prizefight", - "probability", - "probation", - "probe", - "problem", - "procedure", - "proceedings", - "process", - "processing", - "processor", - "proctor", - "procurement", - "produce", - "producer", - "product", - "production", - "productivity", - "profession", - "professional", - "professor", - "profile", - "profit", - "progenitor", - "program", - "programme", - "programming", - "progress", - "progression", - "prohibition", - "project", - "proliferation", - "promenade", - "promise", - "promotion", - "prompt", - "pronoun", - "pronunciation", - "proof", - "proof-reader", - "propaganda", - "propane", - "property", - "prophet", - "proponent", - "proportion", - "proposal", - "proposition", - "proprietor", - "prose", - "prosecution", - "prosecutor", - "prospect", - "prosperity", - "prostacyclin", - "prostanoid", - "prostrate", - "protection", - "protein", - "protest", - "protocol", - "providence", - "provider", - "province", - "provision", - "prow", - "proximal", - "proximity", - "prune", - "pruner", - "pseudocode", - "pseudoscience", - "psychiatrist", - "psychoanalyst", - "psychologist", - "psychology", - "ptarmigan", - "pub", - "public", - "publication", - "publicity", - "publisher", - "publishing", - "pudding", - "puddle", - "puffin", - "pug", - "puggle", - "pulley", - "pulse", - "puma", - "pump", - "pumpernickel", - "pumpkin", - "pumpkinseed", - "pun", - "punch", - "punctuation", - "punishment", - "pup", - "pupa", - "pupil", - "puppet", - "puppy", - "purchase", - "puritan", - "purity", - "purple", - "purpose", - "purr", - "purse", - "pursuit", - "push", - "pusher", - "put", - "puzzle", - "pyramid", - "pyridine", - "quadrant", - "quail", - "qualification", - "quality", - "quantity", - "quart", - "quarter", - "quartet", - "quartz", - "queen", - "query", - "quest", - "question", - "questioner", - "questionnaire", - "quiche", - "quicksand", - "quiet", - "quill", - "quilt", - "quince", - "quinoa", - "quit", - "quiver", - "quota", - "quotation", - "quote", - "rabbi", - "rabbit", - "raccoon", - "race", - "racer", - "racing", - "racism", - "racist", - "rack", - "radar", - "radiator", - "radio", - "radiosonde", - "radish", - "raffle", - "raft", - "rag", - "rage", - "raid", - "rail", - "railing", - "railroad", - "railway", - "raiment", - "rain", - "rainbow", - "raincoat", - "rainmaker", - "rainstorm", - "rainy", - "raise", - "raisin", - "rake", - "rally", - "ram", - "rambler", - "ramen", - "ramie", - "ranch", - "rancher", - "randomisation", - "randomization", - "range", - "ranger", - "rank", - "rap", - "rape", - "raspberry", - "rat", - "rate", - "ratepayer", - "rating", - 
"ratio", - "rationale", - "rations", - "raven", - "ravioli", - "rawhide", - "ray", - "rayon", - "razor", - "reach", - "reactant", - "reaction", - "read", - "reader", - "readiness", - "reading", - "real", - "reality", - "realization", - "realm", - "reamer", - "rear", - "reason", - "reasoning", - "rebel", - "rebellion", - "reboot", - "recall", - "recapitulation", - "receipt", - "receiver", - "reception", - "receptor", - "recess", - "recession", - "recipe", - "recipient", - "reciprocity", - "reclamation", - "recliner", - "recognition", - "recollection", - "recommendation", - "reconsideration", - "record", - "recorder", - "recording", - "recovery", - "recreation", - "recruit", - "rectangle", - "red", - "redesign", - "redhead", - "redirect", - "rediscovery", - "reduction", - "reef", - "refectory", - "reference", - "referendum", - "reflection", - "reform", - "refreshments", - "refrigerator", - "refuge", - "refund", - "refusal", - "refuse", - "regard", - "regime", - "region", - "regionalism", - "register", - "registration", - "registry", - "regret", - "regulation", - "regulator", - "rehospitalisation", - "rehospitalization", - "reindeer", - "reinscription", - "reject", - "relation", - "relationship", - "relative", - "relaxation", - "relay", - "release", - "reliability", - "relief", - "religion", - "relish", - "reluctance", - "remains", - "remark", - "reminder", - "remnant", - "remote", - "removal", - "renaissance", - "rent", - "reorganisation", - "reorganization", - "repair", - "reparation", - "repayment", - "repeat", - "replacement", - "replica", - "replication", - "reply", - "report", - "reporter", - "reporting", - "repository", - "representation", - "representative", - "reprocessing", - "republic", - "republican", - "reputation", - "request", - "requirement", - "resale", - "rescue", - "research", - "researcher", - "resemblance", - "reservation", - "reserve", - "reservoir", - "reset", - "residence", - "resident", - "residue", - "resist", - "resistance", - "resolution", - "resolve", - "resort", - "resource", - "respect", - "respite", - "response", - "responsibility", - "rest", - "restaurant", - "restoration", - "restriction", - "restroom", - "restructuring", - "result", - "resume", - "retailer", - "retention", - "rethinking", - "retina", - "retirement", - "retouching", - "retreat", - "retrospect", - "retrospective", - "retrospectivity", - "return", - "reunion", - "revascularisation", - "revascularization", - "reveal", - "revelation", - "revenant", - "revenge", - "revenue", - "reversal", - "reverse", - "review", - "revitalisation", - "revitalization", - "revival", - "revolution", - "revolver", - "reward", - "rhetoric", - "rheumatism", - "rhinoceros", - "rhubarb", - "rhyme", - "rhythm", - "rib", - "ribbon", - "rice", - "riddle", - "ride", - "rider", - "ridge", - "riding", - "rifle", - "right", - "rim", - "ring", - "ringworm", - "riot", - "rip", - "ripple", - "rise", - "riser", - "risk", - "rite", - "ritual", - "river", - "riverbed", - "rivulet", - "road", - "roadway", - "roar", - "roast", - "robe", - "robin", - "robot", - "robotics", - "rock", - "rocker", - "rocket", - "rocket-ship", - "rod", - "role", - "roll", - "roller", - "romaine", - "romance", - "roof", - "room", - "roommate", - "rooster", - "root", - "rope", - "rose", - "rosemary", - "roster", - "rostrum", - "rotation", - "round", - "roundabout", - "route", - "router", - "routine", - "row", - "rowboat", - "rowing", - "rubber", - "rubbish", - "rubric", - "ruby", - "ruckus", - "rudiment", - "ruffle", - "rug", - "rugby", - "ruin", - "rule", - 
"ruler", - "ruling", - "rum", - "rumor", - "run", - "runaway", - "runner", - "running", - "runway", - "rush", - "rust", - "rutabaga", - "rye", - "sabre", - "sac", - "sack", - "saddle", - "sadness", - "safari", - "safe", - "safeguard", - "safety", - "saffron", - "sage", - "sail", - "sailboat", - "sailing", - "sailor", - "saint", - "sake", - "salad", - "salami", - "salary", - "sale", - "salesman", - "salmon", - "salon", - "saloon", - "salsa", - "salt", - "salute", - "samovar", - "sampan", - "sample", - "samurai", - "sanction", - "sanctity", - "sanctuary", - "sand", - "sandal", - "sandbar", - "sandpaper", - "sandwich", - "sanity", - "sardine", - "sari", - "sarong", - "sash", - "satellite", - "satin", - "satire", - "satisfaction", - "sauce", - "saucer", - "sauerkraut", - "sausage", - "savage", - "savannah", - "saving", - "savings", - "savior", - "saviour", - "savory", - "saw", - "saxophone", - "scaffold", - "scale", - "scallion", - "scallops", - "scalp", - "scam", - "scanner", - "scarecrow", - "scarf", - "scarification", - "scenario", - "scene", - "scenery", - "scent", - "schedule", - "scheduling", - "schema", - "scheme", - "schizophrenic", - "schnitzel", - "scholar", - "scholarship", - "school", - "schoolhouse", - "schooner", - "science", - "scientist", - "scimitar", - "scissors", - "scooter", - "scope", - "score", - "scorn", - "scorpion", - "scotch", - "scout", - "scow", - "scrambled", - "scrap", - "scraper", - "scratch", - "screamer", - "screen", - "screening", - "screenwriting", - "screw", - "screw-up", - "screwdriver", - "scrim", - "scrip", - "script", - "scripture", - "scrutiny", - "sculpting", - "sculptural", - "sculpture", - "sea", - "seabass", - "seafood", - "seagull", - "seal", - "seaplane", - "search", - "seashore", - "seaside", - "season", - "seat", - "seaweed", - "second", - "secrecy", - "secret", - "secretariat", - "secretary", - "secretion", - "section", - "sectional", - "sector", - "security", - "sediment", - "seed", - "seeder", - "seeker", - "seep", - "segment", - "seizure", - "selection", - "self", - "self-confidence", - "self-control", - "self-esteem", - "seller", - "selling", - "semantics", - "semester", - "semicircle", - "semicolon", - "semiconductor", - "seminar", - "senate", - "senator", - "sender", - "senior", - "sense", - "sensibility", - "sensitive", - "sensitivity", - "sensor", - "sentence", - "sentencing", - "sentiment", - "sepal", - "separation", - "septicaemia", - "sequel", - "sequence", - "serial", - "series", - "sermon", - "serum", - "serval", - "servant", - "server", - "service", - "servitude", - "sesame", - "session", - "set", - "setback", - "setting", - "settlement", - "settler", - "severity", - "sewer", - "sexuality", - "shack", - "shackle", - "shade", - "shadow", - "shadowbox", - "shakedown", - "shaker", - "shallot", - "shallows", - "shame", - "shampoo", - "shanty", - "shape", - "share", - "shareholder", - "shark", - "shaw", - "shawl", - "shear", - "shearling", - "sheath", - "shed", - "sheep", - "sheet", - "shelf", - "shell", - "shelter", - "sherbet", - "sherry", - "shield", - "shift", - "shin", - "shine", - "shingle", - "ship", - "shipper", - "shipping", - "shipyard", - "shirt", - "shirtdress", - "shoat", - "shock", - "shoe", - "shoe-horn", - "shoehorn", - "shoelace", - "shoemaker", - "shoes", - "shoestring", - "shofar", - "shoot", - "shootdown", - "shop", - "shopper", - "shopping", - "shore", - "shoreline", - "short", - "shortage", - "shorts", - "shortwave", - "shot", - "shoulder", - "shout", - "shovel", - "show", - "show-stopper", - "shower", - "shred", 
- "shrimp", - "shrine", - "shutdown", - "sibling", - "sick", - "sickness", - "side", - "sideboard", - "sideburns", - "sidecar", - "sidestream", - "sidewalk", - "siding", - "siege", - "sigh", - "sight", - "sightseeing", - "sign", - "signal", - "signature", - "signet", - "significance", - "signify", - "signup", - "silence", - "silica", - "silicon", - "silk", - "silkworm", - "sill", - "silly", - "silo", - "silver", - "similarity", - "simple", - "simplicity", - "simplification", - "simvastatin", - "sin", - "singer", - "singing", - "singular", - "sink", - "sinuosity", - "sip", - "sir", - "sister", - "sister-in-law", - "sitar", - "site", - "situation", - "size", - "skate", - "skating", - "skean", - "skeleton", - "ski", - "skiing", - "skill", - "skin", - "skirt", - "skull", - "skullcap", - "skullduggery", - "skunk", - "sky", - "skylight", - "skyline", - "skyscraper", - "skywalk", - "slang", - "slapstick", - "slash", - "slate", - "slave", - "slavery", - "slaw", - "sled", - "sledge", - "sleep", - "sleepiness", - "sleeping", - "sleet", - "sleuth", - "slice", - "slide", - "slider", - "slime", - "slip", - "slipper", - "slippers", - "slope", - "slot", - "sloth", - "slump", - "smell", - "smelting", - "smile", - "smith", - "smock", - "smog", - "smoke", - "smoking", - "smolt", - "smuggling", - "snack", - "snail", - "snake", - "snakebite", - "snap", - "snarl", - "sneaker", - "sneakers", - "sneeze", - "sniffle", - "snob", - "snorer", - "snow", - "snowboarding", - "snowflake", - "snowman", - "snowmobiling", - "snowplow", - "snowstorm", - "snowsuit", - "snuck", - "snug", - "snuggle", - "soap", - "soccer", - "socialism", - "socialist", - "society", - "sociology", - "sock", - "socks", - "soda", - "sofa", - "softball", - "softdrink", - "softening", - "software", - "soil", - "soldier", - "sole", - "solicitation", - "solicitor", - "solidarity", - "solidity", - "soliloquy", - "solitaire", - "solution", - "solvency", - "sombrero", - "somebody", - "someone", - "someplace", - "somersault", - "something", - "somewhere", - "son", - "sonar", - "sonata", - "song", - "songbird", - "sonnet", - "soot", - "sophomore", - "soprano", - "sorbet", - "sorghum", - "sorrel", - "sorrow", - "sort", - "soul", - "soulmate", - "sound", - "soundness", - "soup", - "source", - "sourwood", - "sousaphone", - "south", - "southeast", - "souvenir", - "sovereignty", - "sow", - "soy", - "soybean", - "space", - "spacing", - "spade", - "spaghetti", - "span", - "spandex", - "spank", - "sparerib", - "spark", - "sparrow", - "spasm", - "spat", - "spatula", - "spawn", - "speaker", - "speakerphone", - "speaking", - "spear", - "spec", - "special", - "specialist", - "specialty", - "species", - "specification", - "spectacle", - "spectacles", - "spectrograph", - "spectrum", - "speculation", - "speech", - "speed", - "speedboat", - "spell", - "spelling", - "spelt", - "spending", - "sphere", - "sphynx", - "spice", - "spider", - "spiderling", - "spike", - "spill", - "spinach", - "spine", - "spiral", - "spirit", - "spiritual", - "spirituality", - "spit", - "spite", - "spleen", - "splendor", - "split", - "spokesman", - "spokeswoman", - "sponge", - "sponsor", - "sponsorship", - "spool", - "spoon", - "spork", - "sport", - "sportsman", - "spot", - "spotlight", - "spouse", - "sprag", - "sprat", - "spray", - "spread", - "spreadsheet", - "spree", - "spring", - "sprinkles", - "sprinter", - "sprout", - "spruce", - "spud", - "spume", - "spur", - "spy", - "spyglass", - "square", - "squash", - "squatter", - "squeegee", - "squid", - "squirrel", - "stab", - "stability", - 
"stable", - "stack", - "stacking", - "stadium", - "staff", - "stag", - "stage", - "stain", - "stair", - "staircase", - "stake", - "stalk", - "stall", - "stallion", - "stamen", - "stamina", - "stamp", - "stance", - "stand", - "standard", - "standardisation", - "standardization", - "standing", - "standoff", - "standpoint", - "star", - "starboard", - "start", - "starter", - "state", - "statement", - "statin", - "station", - "station-wagon", - "statistic", - "statistics", - "statue", - "status", - "statute", - "stay", - "steak", - "stealth", - "steam", - "steamroller", - "steel", - "steeple", - "stem", - "stench", - "stencil", - "step", - "step-aunt", - "step-brother", - "step-daughter", - "step-father", - "step-grandfather", - "step-grandmother", - "step-mother", - "step-sister", - "step-son", - "step-uncle", - "stepdaughter", - "stepmother", - "stepping-stone", - "stepson", - "stereo", - "stew", - "steward", - "stick", - "sticker", - "stiletto", - "still", - "stimulation", - "stimulus", - "sting", - "stinger", - "stir-fry", - "stitch", - "stitcher", - "stock", - "stock-in-trade", - "stockings", - "stole", - "stomach", - "stone", - "stonework", - "stool", - "stop", - "stopsign", - "stopwatch", - "storage", - "store", - "storey", - "storm", - "story", - "story-telling", - "storyboard", - "stot", - "stove", - "strait", - "strand", - "stranger", - "strap", - "strategy", - "straw", - "strawberry", - "strawman", - "stream", - "street", - "streetcar", - "strength", - "stress", - "stretch", - "strife", - "strike", - "string", - "strip", - "stripe", - "strobe", - "stroke", - "structure", - "strudel", - "struggle", - "stucco", - "stud", - "student", - "studio", - "study", - "stuff", - "stumbling", - "stump", - "stupidity", - "sturgeon", - "sty", - "style", - "styling", - "stylus", - "sub", - "subcomponent", - "subconscious", - "subcontractor", - "subexpression", - "subgroup", - "subject", - "submarine", - "submitter", - "subprime", - "subroutine", - "subscription", - "subsection", - "subset", - "subsidence", - "subsidiary", - "subsidy", - "substance", - "substitution", - "subtitle", - "suburb", - "subway", - "success", - "succotash", - "suck", - "sucker", - "suede", - "suet", - "suffocation", - "sugar", - "suggestion", - "suicide", - "suit", - "suitcase", - "suite", - "sulfur", - "sultan", - "sum", - "summary", - "summer", - "summit", - "sun", - "sunbeam", - "sunbonnet", - "sundae", - "sunday", - "sundial", - "sunflower", - "sunglasses", - "sunlamp", - "sunlight", - "sunrise", - "sunroom", - "sunset", - "sunshine", - "superiority", - "supermarket", - "supernatural", - "supervision", - "supervisor", - "supper", - "supplement", - "supplier", - "supply", - "support", - "supporter", - "suppression", - "supreme", - "surface", - "surfboard", - "surge", - "surgeon", - "surgery", - "surname", - "surplus", - "surprise", - "surround", - "surroundings", - "surrounds", - "survey", - "survival", - "survivor", - "sushi", - "suspect", - "suspenders", - "suspension", - "sustainment", - "sustenance", - "swallow", - "swamp", - "swan", - "swanling", - "swath", - "sweat", - "sweater", - "sweatshirt", - "sweatshop", - "sweatsuit", - "sweets", - "swell", - "swim", - "swimming", - "swimsuit", - "swine", - "swing", - "switch", - "switchboard", - "switching", - "swivel", - "sword", - "swordfight", - "swordfish", - "sycamore", - "symbol", - "symmetry", - "sympathy", - "symptom", - "syndicate", - "syndrome", - "synergy", - "synod", - "synonym", - "synthesis", - "syrup", - "system", - "t-shirt", - "tab", - "tabby", - 
"tabernacle", - "table", - "tablecloth", - "tablet", - "tabletop", - "tachometer", - "tackle", - "taco", - "tactics", - "tactile", - "tadpole", - "tag", - "tail", - "tailbud", - "tailor", - "tailspin", - "take-out", - "takeover", - "tale", - "talent", - "talk", - "talking", - "tam-o'-shanter", - "tamale", - "tambour", - "tambourine", - "tan", - "tandem", - "tangerine", - "tank", - "tank-top", - "tanker", - "tankful", - "tap", - "tape", - "tapioca", - "target", - "taro", - "tarragon", - "tart", - "task", - "tassel", - "taste", - "tatami", - "tattler", - "tattoo", - "tavern", - "tax", - "taxi", - "taxicab", - "taxpayer", - "tea", - "teacher", - "teaching", - "team", - "teammate", - "teapot", - "tear", - "tech", - "technician", - "technique", - "technologist", - "technology", - "tectonics", - "teen", - "teenager", - "teepee", - "telephone", - "telescreen", - "teletype", - "television", - "tell", - "teller", - "temp", - "temper", - "temperature", - "temple", - "tempo", - "temporariness", - "temporary", - "temptation", - "temptress", - "tenant", - "tendency", - "tender", - "tenement", - "tenet", - "tennis", - "tenor", - "tension", - "tensor", - "tent", - "tentacle", - "tenth", - "tepee", - "teriyaki", - "term", - "terminal", - "termination", - "terminology", - "termite", - "terrace", - "terracotta", - "terrapin", - "terrarium", - "territory", - "terror", - "terrorism", - "terrorist", - "test", - "testament", - "testimonial", - "testimony", - "testing", - "text", - "textbook", - "textual", - "texture", - "thanks", - "thaw", - "theater", - "theft", - "theism", - "theme", - "theology", - "theory", - "therapist", - "therapy", - "thermals", - "thermometer", - "thermostat", - "thesis", - "thickness", - "thief", - "thigh", - "thing", - "thinking", - "thirst", - "thistle", - "thong", - "thongs", - "thorn", - "thought", - "thousand", - "thread", - "threat", - "threshold", - "thrift", - "thrill", - "throat", - "throne", - "thrush", - "thrust", - "thug", - "thumb", - "thump", - "thunder", - "thunderbolt", - "thunderhead", - "thunderstorm", - "thyme", - "tiara", - "tic", - "tick", - "ticket", - "tide", - "tie", - "tiger", - "tights", - "tile", - "till", - "tilt", - "timbale", - "timber", - "time", - "timeline", - "timeout", - "timer", - "timetable", - "timing", - "timpani", - "tin", - "tinderbox", - "tinkle", - "tintype", - "tip", - "tire", - "tissue", - "titanium", - "title", - "toad", - "toast", - "toaster", - "tobacco", - "today", - "toe", - "toenail", - "toffee", - "tofu", - "tog", - "toga", - "toilet", - "tolerance", - "tolerant", - "toll", - "tom-tom", - "tomatillo", - "tomato", - "tomb", - "tomography", - "tomorrow", - "ton", - "tonality", - "tone", - "tongue", - "tonic", - "tonight", - "tool", - "toot", - "tooth", - "toothbrush", - "toothpaste", - "toothpick", - "top", - "top-hat", - "topic", - "topsail", - "toque", - "toreador", - "tornado", - "torso", - "torte", - "tortellini", - "tortilla", - "tortoise", - "total", - "tote", - "touch", - "tough-guy", - "tour", - "tourism", - "tourist", - "tournament", - "tow-truck", - "towel", - "tower", - "town", - "townhouse", - "township", - "toy", - "trace", - "trachoma", - "track", - "tracking", - "tracksuit", - "tract", - "tractor", - "trade", - "trader", - "trading", - "tradition", - "traditionalism", - "traffic", - "trafficker", - "tragedy", - "trail", - "trailer", - "trailpatrol", - "train", - "trainer", - "training", - "trait", - "tram", - "tramp", - "trance", - "transaction", - "transcript", - "transfer", - "transformation", - "transit", - 
"transition", - "translation", - "transmission", - "transom", - "transparency", - "transplantation", - "transport", - "transportation", - "trap", - "trapdoor", - "trapezium", - "trapezoid", - "trash", - "travel", - "traveler", - "tray", - "treasure", - "treasury", - "treat", - "treatment", - "treaty", - "tree", - "trek", - "trellis", - "tremor", - "trench", - "trend", - "triad", - "trial", - "triangle", - "tribe", - "tributary", - "trick", - "trigger", - "trigonometry", - "trillion", - "trim", - "trinket", - "trip", - "tripod", - "tritone", - "triumph", - "trolley", - "trombone", - "troop", - "trooper", - "trophy", - "trouble", - "trousers", - "trout", - "trove", - "trowel", - "truck", - "trumpet", - "trunk", - "trust", - "trustee", - "truth", - "try", - "tsunami", - "tub", - "tuba", - "tube", - "tuber", - "tug", - "tugboat", - "tuition", - "tulip", - "tumbler", - "tummy", - "tuna", - "tune", - "tune-up", - "tunic", - "tunnel", - "turban", - "turf", - "turkey", - "turmeric", - "turn", - "turning", - "turnip", - "turnover", - "turnstile", - "turret", - "turtle", - "tusk", - "tussle", - "tutu", - "tuxedo", - "tweet", - "tweezers", - "twig", - "twilight", - "twine", - "twins", - "twist", - "twister", - "twitter", - "type", - "typeface", - "typewriter", - "typhoon", - "ukulele", - "ultimatum", - "umbrella", - "unblinking", - "uncertainty", - "uncle", - "underclothes", - "underestimate", - "underground", - "underneath", - "underpants", - "underpass", - "undershirt", - "understanding", - "understatement", - "undertaker", - "underwear", - "underweight", - "underwire", - "underwriting", - "unemployment", - "unibody", - "uniform", - "uniformity", - "union", - "unique", - "unit", - "unity", - "universe", - "university", - "update", - "upgrade", - "uplift", - "upper", - "upstairs", - "upward", - "urge", - "urgency", - "urn", - "usage", - "use", - "user", - "usher", - "usual", - "utensil", - "utilisation", - "utility", - "utilization", - "vacation", - "vaccine", - "vacuum", - "vagrant", - "valance", - "valentine", - "validate", - "validity", - "valley", - "valuable", - "value", - "vampire", - "van", - "vanadyl", - "vane", - "vanilla", - "vanity", - "variability", - "variable", - "variant", - "variation", - "variety", - "vascular", - "vase", - "vault", - "vaulting", - "veal", - "vector", - "vegetable", - "vegetarian", - "vegetarianism", - "vegetation", - "vehicle", - "veil", - "vein", - "veldt", - "vellum", - "velocity", - "velodrome", - "velvet", - "vendor", - "veneer", - "vengeance", - "venison", - "venom", - "venti", - "venture", - "venue", - "veranda", - "verb", - "verdict", - "verification", - "vermicelli", - "vernacular", - "verse", - "version", - "vertigo", - "verve", - "vessel", - "vest", - "vestment", - "vet", - "veteran", - "veterinarian", - "veto", - "viability", - "vibe", - "vibraphone", - "vibration", - "vibrissae", - "vice", - "vicinity", - "victim", - "victory", - "video", - "view", - "viewer", - "vignette", - "villa", - "village", - "vine", - "vinegar", - "vineyard", - "vintage", - "vintner", - "vinyl", - "viola", - "violation", - "violence", - "violet", - "violin", - "virginal", - "virtue", - "virus", - "visa", - "viscose", - "vise", - "vision", - "visit", - "visitor", - "visor", - "vista", - "visual", - "vitality", - "vitamin", - "vitro", - "vivo", - "vixen", - "vodka", - "vogue", - "voice", - "void", - "vol", - "volatility", - "volcano", - "volleyball", - "volume", - "volunteer", - "volunteering", - "vomit", - "vote", - "voter", - "voting", - "voyage", - "vulture", - "wad", - 
"wafer", - "waffle", - "wage", - "wagon", - "waist", - "waistband", - "wait", - "waiter", - "waiting", - "waitress", - "waiver", - "wake", - "walk", - "walker", - "walking", - "walkway", - "wall", - "wallaby", - "wallet", - "walnut", - "walrus", - "wampum", - "wannabe", - "want", - "war", - "warden", - "wardrobe", - "warfare", - "warlock", - "warlord", - "warm-up", - "warming", - "warmth", - "warning", - "warrant", - "warren", - "warrior", - "wasabi", - "wash", - "washbasin", - "washcloth", - "washer", - "washtub", - "wasp", - "waste", - "wastebasket", - "wasting", - "watch", - "watcher", - "watchmaker", - "water", - "waterbed", - "watercress", - "waterfall", - "waterfront", - "watermelon", - "waterskiing", - "waterspout", - "waterwheel", - "wave", - "waveform", - "wax", - "way", - "weakness", - "wealth", - "weapon", - "wear", - "weasel", - "weather", - "web", - "webinar", - "webmail", - "webpage", - "website", - "wedding", - "wedge", - "weed", - "weeder", - "weedkiller", - "week", - "weekend", - "weekender", - "weight", - "weird", - "welcome", - "welfare", - "well", - "well-being", - "west", - "western", - "wet-bar", - "wetland", - "wetsuit", - "whack", - "whale", - "wharf", - "wheat", - "wheel", - "whelp", - "whey", - "whip", - "whirlpool", - "whirlwind", - "whisker", - "whiskey", - "whisper", - "whistle", - "white", - "whole", - "wholesale", - "wholesaler", - "whorl", - "wick", - "widget", - "widow", - "width", - "wife", - "wifi", - "wild", - "wildebeest", - "wilderness", - "wildlife", - "will", - "willingness", - "willow", - "win", - "wind", - "wind-chime", - "windage", - "window", - "windscreen", - "windshield", - "wine", - "winery", - "wing", - "wingman", - "wingtip", - "wink", - "winner", - "winter", - "wire", - "wiretap", - "wiring", - "wisdom", - "wiseguy", - "wish", - "wisteria", - "wit", - "witch", - "witch-hunt", - "withdrawal", - "witness", - "wok", - "wolf", - "woman", - "wombat", - "wonder", - "wont", - "wood", - "woodchuck", - "woodland", - "woodshed", - "woodwind", - "wool", - "woolens", - "word", - "wording", - "work", - "workbench", - "worker", - "workforce", - "workhorse", - "working", - "workout", - "workplace", - "workshop", - "world", - "worm", - "worry", - "worship", - "worshiper", - "worth", - "wound", - "wrap", - "wraparound", - "wrapper", - "wrapping", - "wreck", - "wrecker", - "wren", - "wrench", - "wrestler", - "wriggler", - "wrinkle", - "wrist", - "writer", - "writing", - "wrong", - "xylophone", - "yacht", - "yahoo", - "yak", - "yam", - "yang", - "yard", - "yarmulke", - "yarn", - "yawl", - "year", - "yeast", - "yellow", - "yellowjacket", - "yesterday", - "yew", - "yin", - "yoga", - "yogurt", - "yoke", - "yolk", - "young", - "youngster", - "yourself", - "youth", - "yoyo", - "yurt", - "zampone", - "zebra", - "zebrafish", - "zen", - "zephyr", - "zero", - "ziggurat", - "zinc", - "zipper", - "zither", - "zombie", - "zone", - "zoo", - "zoologist", - "zoology", - "zoot-suit", - "zucchini", - ] diff --git a/src/fmcore/util/language/_structs.py b/src/fmcore/util/language/_structs.py deleted file mode 100644 index de4a6a4..0000000 --- a/src/fmcore/util/language/_structs.py +++ /dev/null @@ -1,904 +0,0 @@ -import random -import re -from ast import literal_eval -from collections import defaultdict -from contextlib import contextmanager -from typing import * - -import numpy as np -import pandas as pd -from pydantic.typing import Literal - -from ._alias import set_param_from_alias -from ._autoenum import AutoEnum -from ._import import optional_dependency -from ._utils 
import get_default, is_not_null - -ListOrTuple = Union[List, Tuple] -DataFrameOrSeries = Union[pd.Series, pd.DataFrame] -SeriesOrArray1D = Union[pd.Series, List, Tuple, np.ndarray] -DataFrameOrArray2D = Union[pd.Series, pd.DataFrame, List, List[List], np.ndarray] -SeriesOrArray1DOrDataFrameOrArray2D = Union[SeriesOrArray1D, DataFrameOrArray2D] - - -def not_impl( - param_name: str, - param_val: Any, - supported: Optional[Union[List, Set, Tuple, Any]] = None, -) -> Exception: - if not isinstance(param_name, str): - raise ValueError("First value `param_name` must be a string.") - param_val_str: str = str(param_val) - if len(param_val_str) > 100: - param_val_str: str = "\n" + param_val_str - if supported is not None: - supported: List = as_list(supported) - return NotImplementedError( - f"Unsupported value for param `{param_name}`. Valid values are: {supported}; " - f"found {type(param_val)} having value: {param_val_str}" - ) - - return NotImplementedError( - f"Unsupported value for param `{param_name}`; found {type(param_val)} having value: {param_val_str}" - ) - - -## ======================== List utils ======================== ## -def is_list_like(l: Any) -> bool: - with optional_dependency("dask"): - from dask.dataframe.core import Series as DaskSeries - - if isinstance(l, (list, tuple, ValuesView, ItemsView, pd.Series, DaskSeries)): - return True - if isinstance(l, (list, tuple, ValuesView, ItemsView, pd.Series)): - return True - if isinstance(l, np.ndarray) and l.ndim == 1: - return True - return False - - -def is_not_empty_list_like(l: ListOrTuple) -> bool: - return is_list_like(l) and len(l) > 0 - - -def is_empty_list_like(l: ListOrTuple) -> bool: - return not is_not_empty_list_like(l) - - -def assert_not_empty_list(l: List): - assert is_not_empty_list(l) - - -def assert_not_empty_list_like(l: ListOrTuple, error_message=""): - assert is_not_empty_list_like(l), error_message - - -def is_not_empty_list(l: List) -> bool: - return isinstance(l, list) and len(l) > 0 - - -def is_empty_list(l: List) -> bool: - return not is_not_empty_list(l) - - -def as_list(l) -> List: - if is_list_or_set_like(l): - return list(l) - return [l] - - -def list_pop_inplace(l: List, *, pop_condition: Callable) -> List: - assert isinstance(l, list) ## Needs to be a mutable - ## Iterate backwards to preserve indexes while iterating - for i in range(len(l) - 1, -1, -1): # Iterate backwards - if pop_condition(l[i]): - l.pop(i) ## Remove the item inplace - return l - - -def set_union(*args) -> Set: - _union: Set = set() - for s in args: - if isinstance(s, (pd.Series, np.ndarray)): - s: List = s.tolist() - s: Set = set(s) - _union: Set = _union.union(s) - return _union - - -def set_intersection(*args) -> Set: - _intersection: Optional[Set] = None - for s in args: - if isinstance(s, (pd.Series, np.ndarray)): - s: List = s.tolist() - s: Set = set(s) - if _intersection is None: - _intersection: Set = s - else: - _intersection: Set = _intersection.intersection(s) - return _intersection - - -def filter_string_list(l: List[str], pattern: str, ignorecase: bool = False) -> List[str]: - """ - Filter a list of strings based on an exact match to a regex pattern. Leaves non-string items untouched. - :param l: list of strings - :param pattern: Regex pattern used to match each item in list of strings. - Strings which are not a regex pattern will be expected to exactly match. - E.g. the pattern 'abcd' will only match the string 'abcd'. - To match 'abcdef', pattern 'abcd.*' should be used. 
- To match 'xyzabcd', pattern '.*abcd' should be used. - To match 'abcdef', 'xyzabcd' and 'xyzabcdef', pattern '.*abcd.*' should be used. - :param ignorecase: whether to ignore case while matching the pattern to the strings. - :return: filtered list of strings which match the pattern. - """ - if not pattern.startswith("^"): - pattern = "^" + pattern - if not pattern.endswith("$"): - pattern = pattern + "$" - flags = 0 - if ignorecase: - flags = flags | re.IGNORECASE - return [x for x in l if not isinstance(x, str) or len(re.findall(pattern, x, flags=flags)) > 0] - - -def keep_values( - a: Union[List, Tuple, Set, Dict], - values: Any, -) -> Union[List, Tuple, Set, Dict]: - values: Set = as_set(values) - if isinstance(a, list): - return list(x for x in a if x in values) - elif isinstance(a, tuple): - return tuple(x for x in a if x in values) - elif isinstance(a, set): - return set(x for x in a if x in values) - elif isinstance(a, dict): - return {k: v for k, v in a.items() if v in values} - raise NotImplementedError(f"Unsupported data structure: {type(a)}") - - -def remove_values( - a: Union[List, Tuple, Set, Dict], - values: Any, -) -> Union[List, Tuple, Set, Dict]: - values: Set = as_set(values) - if isinstance(a, list): - return list(x for x in a if x not in values) - elif isinstance(a, tuple): - return tuple(x for x in a if x not in values) - elif isinstance(a, set): - return set(x for x in a if x not in values) - elif isinstance(a, dict): - return {k: v for k, v in a.items() if v not in values} - raise NotImplementedError(f"Unsupported data structure: {type(a)}") - - -def remove_nulls( - a: Union[List, Tuple, Set, Dict], -) -> Union[List, Tuple, Set, Dict]: - if isinstance(a, list): - return list(x for x in a if is_not_null(x)) - elif isinstance(a, tuple): - return tuple(x for x in a if is_not_null(x)) - elif isinstance(a, set): - return set(x for x in a if is_not_null(x)) - elif isinstance(a, dict): - return {k: v for k, v in a.items() if is_not_null(v)} - raise NotImplementedError(f"Unsupported data structure: {type(a)}") - - -def elvis(d: Optional[Union[Dict, Any]], *args) -> Optional[Any]: - if len(args) == 0: - raise ValueError("Must pass non-empty list of keys to match when using elvis operator") - val: Union[Dict, Any] = get_default(d, {}) - for k in args: - val: Union[Dict, Any] = get_default(val, {}) - if isinstance(val, dict): - val: Union[Dict, Any] = val.get(k) - else: - return val - return val - - -## ======================== Tuple utils ======================== ## -def as_tuple(l) -> Tuple: - if is_list_or_set_like(l): - return tuple(l) - return (l,) - - -## ======================== Set utils ======================== ## -def is_set_like(l: Any) -> bool: - return isinstance(l, (set, frozenset, KeysView)) - - -def is_list_or_set_like(l: Union[List, Tuple, np.ndarray, pd.Series, Set, frozenset]): - return is_list_like(l) or is_set_like(l) - - -def get_subset(small_list: ListOrTuple, big_list: ListOrTuple) -> Set: - assert is_list_like(small_list) - assert is_list_like(big_list) - return set.intersection(set(small_list), set(big_list)) - - -def is_subset(small_list: ListOrTuple, big_list: ListOrTuple) -> bool: - return len(get_subset(small_list, big_list)) == len(small_list) - - -def as_set(s) -> Set: - if isinstance(s, set): - return s - if is_list_or_set_like(s): - return set(s) - return {s} - - -## ======================== Dict utils ======================== ## -def append_to_keys(d: Dict, prefix: Union[List[str], str] = "", suffix: Union[List[str], str] = "") -> Dict: - if
not is_dict_like(d): - raise ValueError(f"Expected a dict-like object, found: {type(d)}") - keys = set(d.keys()) - for k in keys: - new_keys = ( - {f"{p}{k}" for p in as_list(prefix)} - | {f"{k}{s}" for s in as_list(suffix)} - | {f"{p}{k}{s}" for p in as_list(prefix) for s in as_list(suffix)} - ) - for k_new in new_keys: - d[k_new] = d[k] - return d - - -def transform_keys_case(d: Dict, case: Literal["lower", "upper"] = "lower"): - """ - Converts string dict keys to either uppercase or lowercase. Leaves non-string keys untouched. - :param d: dict to transform - :param case: desired case, either 'lower' or 'upper' - :return: dict with case-transformed keys - """ - if not is_dict_like(d): - raise ValueError(f"Expected a dict-like object, found: {type(d)}") - assert case in {"lower", "upper"} - out = {} - for k, v in d.items(): - if isinstance(k, str): - if case == "lower": - out[k.lower()] = v - elif case == "upper": - out[k.upper()] = v - else: - out[k] = v - return out - - -def transform_values_case(d: Dict, case: Literal["lower", "upper"] = "lower"): - """ - Converts string dict values to either uppercase or lowercase. Leaves non-string values untouched. - :param d: dict to transform - :param case: desired case, either 'lower' or 'upper' - :return: dict with case-transformed values - """ - if not is_dict_like(d): - raise ValueError(f"Expected a dict-like object, found: {type(d)}") - assert case in {"lower", "upper"} - out = {} - for k, v in d.items(): - if isinstance(v, str): - if case == "lower": - out[k] = v.lower() - elif case == "upper": - out[k] = v.upper() - else: - out[k] = v - return out - - -def dict_set_default(d: Dict, default_params: Dict) -> Dict: - """ - Sets default values in a dict for missing keys - :param d: input dict - :param default_params: dict of default values - :return: input dict with default values populated for missing keys - """ - if d is None: - d = {} - assert isinstance(d, dict) - if default_params is None: - return d - assert isinstance(default_params, dict) - for k, v in default_params.items(): - if isinstance(v, dict) and isinstance(d.get(k), dict): - ## We need to go deeper: - d[k] = dict_set_default(d[k], v) - else: - d.setdefault(k, v) - return d - - -def sorted_dict( - d: Dict, - *, - by: Literal["key", "value"] = "key", - reverse: bool = False, - order: Optional[List] = None, -) -> List[Tuple]: - assert by in {"key", "value"} - if order is not None: - order: List = as_list(order) - assert by == "key" - out_d: Dict = {} - for k in order: - ## In order - out_d[k] = d[k] - for k in set(d.keys()) - set(order): - ## Unordered - out_d[k] = d[k] - return list(out_d.items()) - else: - if by == "key": - return sorted(d.items(), key=lambda x: str(x[0]), reverse=reverse) - elif by == "value": - return sorted(d.items(), key=lambda x: str(x[1]), reverse=reverse) - else: - raise not_impl("by", by) - - -def dict_key_with_best_value( - d: Dict, - *, - how: Literal["max", "min"], -) -> Any: - assert how in {"max", "min"} - sorted_items: List[Tuple] = sorted_dict( - d, - by="value", - reverse={ - "min": False, - "max": True, - }[how], - ) - return sorted_items[0][0] - - -def filter_keys( - d: Dict, - keys: Union[List, Tuple, Set, str], - how: Literal["include", "exclude"] = "include", -) -> Dict: - """ - Filter values in a dict based on a list of keys. - :param d: dict to filter - :param keys: list of keys to include/exclude. - :param how: whether to keep or remove keys in filtered_keys list. 
- :return: dict with filtered list of keys - """ - if not is_dict_like(d): - raise ValueError(f"Expected a dict-like object, found: {type(d)}") - keys: Set = as_set(keys) - if how == "include": - return keep_keys(d, keys) - elif how == "exclude": - return remove_keys(d, keys) - else: - raise NotImplementedError(f'Invalid value for parameter `how`: "{how}"') - - -def filter_values( - struct: Union[List, Tuple, Set, Dict, str], - fn: Callable, - *, - raise_error: bool = True, -) -> Optional[Any]: - if (is_list_like(struct) or is_set_like(struct)) and len(struct) > 0: - return type(struct)([x for x in struct if fn(x)]) - elif is_dict_like(struct): - return dict({k: v for k, v in struct.items() if fn(v)}) - if raise_error: - raise ValueError(f"Unsupported structure: {type(struct)}") - return None - - -def keep_keys(d: Dict, keys: Union[List, Tuple, Set, str]) -> Dict: - keys: Set = as_set(keys) - return {k: d[k] for k in keys if k in d} - - -def remove_keys(d: Dict, keys: Union[List, Tuple, Set, str]) -> Dict: - keys: Set = as_set(keys) - return {k: d[k] for k in d if k not in keys} - - -class UniqueDict(dict): - def __setitem__(self, key, value): ## Dict which rejects updates for existing keys. - if key not in self: - dict.__setitem__(self, key, value) - else: - raise KeyError("Key already exists") - - -def convert_and_filter_keys_on_enum( - d: Dict, - AutoEnumClass: AutoEnum.__class__, - how: Literal["include", "exclude"] = "include", -) -> Dict: - """ - Filter values in a dict based on those matching an enum. - :param d: dict to filter. - :param AutoEnumClass: AutoEnum class on which to filter. - :param how: whether to keep or remove keys in the AutoEnum class. - :return: dict with filtered list of keys - """ - if not is_dict_like(d): - raise ValueError(f"Expected a dict-like object, found: {type(d)}") - if AutoEnumClass is None: - return {} - assert isinstance(AutoEnumClass, AutoEnum.__class__) - d = AutoEnumClass.convert_keys(d) - return filter_keys(d, list(AutoEnumClass), how=how) - - -def filter_keys_on_pattern( - d: Dict, - key_pattern: str, - ignorecase: bool = False, - how: Literal["include", "exclude"] = "include", -): - """ - Filter string keys in a dict based on a regex pattern. - :param d: dict to filter - :param key_pattern: regex pattern used to match keys. - :param how: whether to keep or remove keys. - Follows same rules as `filter_string_list` method, i.e. only checks string keys and retains non-string keys. 
- :return: dict with filtered keys - """ - keys: List = list(d.keys()) - filtered_keys: List = filter_string_list(keys, key_pattern, ignorecase=ignorecase) - return filter_keys(d, filtered_keys, how=how) - - -def is_not_empty_dict(d: Dict) -> bool: - return is_dict_like(d) and len(d) > 0 - - -def is_empty_dict(d: Dict) -> bool: - return not is_not_empty_dict(d) - - -def assert_not_empty_dict(d: Dict): - assert is_not_empty_dict(d) - - -def is_dict_like(d: Union[Dict, defaultdict]) -> bool: - return isinstance(d, (dict, defaultdict)) - - -def is_list_or_dict_like(d: Any) -> bool: - return is_list_like(d) or is_dict_like(d) - - -def is_list_of_dict_like(d: List[Dict]) -> bool: - if not is_list_like(d): - return False - for x in d: - if not is_dict_like(x): - return False - return True - - -def is_dict_like_or_list_of_dict_like(d: Union[Dict, List[Dict]]) -> bool: - if is_dict_like(d): - return True - elif is_list_like(d): - return is_list_of_dict_like(d) - return False - - -def eval_dict_values(params: Dict): - if not isinstance(params, dict): - raise ValueError(f"{params} should be of type dict") - updated_dict = {} - for parameter, value in params.items(): - try: - updated_dict[parameter] = literal_eval(value) - except Exception: - updated_dict[parameter] = value - return updated_dict - - -def invert_dict(d: Dict) -> Dict: - if not isinstance(d, dict): - raise ValueError(f"{d} should be of type dict") - d_inv: Dict = {v: k for k, v in d.items()} - if len(d_inv) != len(d): - raise ValueError("Dict is not invertible as values are not unique.") - return d_inv - - -def iter_dict(d, depth: int = 1, *, _cur_depth: int = 0): - """ - Recursively iterate over nested dictionaries and yield keys at each depth. - - :param d: The dictionary to iterate over. - :param depth: The current depth of recursion (used for tracking depth of keys). - :return: Yields tuples where the first elements are keys at different depths, and the last element is the value. 
- """ - assert isinstance(d, dict), f"Input must be a dictionary, found: {type(d)}" - assert isinstance(depth, int) and depth >= 1, "depth must be an integer (1 or more)" - - for k, v in d.items(): - if isinstance(v, dict) and _cur_depth < depth - 1: - # If the value is a dictionary, recurse - for subkeys in iter_dict(v, _cur_depth=_cur_depth + 1, depth=depth): - yield (k,) + subkeys - else: - # If the value is not a dictionary, yield the key-value pair - yield (k, v) - - -## ======================== Utils for multiple collections ======================== ## -def only_item( - d: Union[Dict, List, Tuple, Set, np.ndarray, pd.Series], - raise_error: bool = True, -) -> Union[Dict, List, Tuple, Set, np.ndarray, pd.Series, Any]: - if not (is_list_or_set_like(d) or is_dict_like(d)): - return d - if len(d) == 1: - if is_dict_like(d): - return next(iter(d.items())) - return next(iter(d)) - if raise_error: - raise ValueError(f"Expected input {type(d)} to have only one item; found {len(d)} elements.") - return d - - -def only_key(d: Dict, raise_error: bool = True) -> Union[Any]: - if not is_dict_like(d): - return d - if len(d) == 1: - return next(iter(d.keys())) - if raise_error: - raise ValueError(f"Expected input {type(d)} to have only one item; found {len(d)} elements.") - return d - - -def only_value(d: Dict, raise_error: bool = True) -> Union[Any]: - if not is_dict_like(d): - return d - if len(d) == 1: - return next(iter(d.values())) - if raise_error: - raise ValueError(f"Expected input {type(d)} to have only one item; found {len(d)} elements.") - return d - - -def is_1d_array(l: Union[List, Tuple]): - return is_list_like(l) and len(l) > 0 and not is_list_like(l[0]) - - -def is_2d_array(l: Union[List, Tuple]): - return is_list_like(l) and len(l) > 0 and is_list_like(l[0]) - - -def convert_1d_or_2d_array_to_dataframe(data: SeriesOrArray1DOrDataFrameOrArray2D) -> pd.DataFrame: - if is_1d_array(data): - data: pd.Series = convert_1d_array_to_series(data) - if isinstance(data, pd.Series) or is_2d_array(data): - data: pd.DataFrame = pd.DataFrame(data) - assert isinstance(data, pd.DataFrame) - return data - - -def convert_1d_array_to_series(data: SeriesOrArray1D): - if len(data) == 0: - raise ValueError("Cannot convert empty data structure to series") - if isinstance(data, pd.Series): - return data - if not is_list_like(data): - raise ValueError("Cannot convert non list-like data structure to series") - return pd.Series(data) - - -def flatten1d(l: Union[List, Tuple, Set, Any], output_type: Type = list) -> Union[List, Set, Tuple]: - assert output_type in {list, set, tuple} - if not is_list_or_set_like(l): - return l - out = [] - for x in l: - out.extend(as_list(flatten1d(x))) - return output_type(out) - - -def flatten2d( - l: Union[List, Tuple, Set, Any], - outer_type: Type = list, - inner_type: Type = tuple, -) -> Union[List, Tuple, Set, Any]: - assert outer_type in {list, set, tuple} - assert inner_type in {list, set, tuple} - if not is_list_or_set_like(l): - return l - out: List[Union[List, Set, Tuple]] = [flatten1d(x, output_type=inner_type) for x in l] - return outer_type(out) - - -def partial_sort( - struct: Union[List[Any], Tuple[Any]], - order: Union[List[Any], Tuple[Any], Any], -) -> Union[List[Any], Tuple[Any]]: - """ - Partially sorts a list or tuple. 
- """ - ## Dictionary to store the count of each element in order - order: List[Any] = as_list(order) - order_count: Dict[Any, int] = {item: 0 for item in order} - - # Two lists: one for elements in order and one for the rest - ordered_part: List[Any] = [] - rest_part: List[Any] = [] - - for item in struct: - if item in order_count: - # If the item is in order, increment the count and add to ordered_part - order_count[item] += 1 - else: - # Otherwise, add to rest_part - rest_part.append(item) - - ## Construct the final ordered part based on the count - for item in order: - ordered_part.extend([item] * order_count[item]) - - ## Combine the ordered part with the rest - out: List[Any] = ordered_part + rest_part - if isinstance(struct, tuple): - return tuple(out) - return out - - -def is_sorted(l: Union[List[Any], Tuple[Any, ...]], *, reverse: bool = False) -> bool: - assert isinstance(l, (list, tuple)) - length = len(l) - assert length > 0 - if length == 1: - return True - if reverse: - l: List[Any] = list(l)[::-1] - for x, x_next in zip(l[0 : length - 1], l[1:length]): - if x > x_next: - return False - return True - - -def get_unique(data: SeriesOrArray1DOrDataFrameOrArray2D, exclude_nans: bool = True) -> Set[Any]: - if data is None: - return set() - if isinstance(data, pd.Series) or isinstance(data, pd.DataFrame): - data: np.ndarray = data.values - if is_2d_array(data): - data: np.ndarray = convert_1d_or_2d_array_to_dataframe(data).values - if not isinstance(data, np.ndarray): - data: np.ndarray = np.array(data) - flattened_data = data.ravel( - "K" - ) ## 1-D array of all data (w/ nans). Ref: https://stackoverflow.com/a/26977495 - if len(flattened_data) == 0: - return set() - if exclude_nans: - flattened_data = flattened_data[~pd.isnull(flattened_data)] - flattened_data = np.unique(flattened_data) - return set(flattened_data) - - -def any_item( - struct: Union[List, Tuple, Set, Dict, ValuesView, str], - *, - seed: Optional[int] = None, - raise_error: bool = True, -) -> Optional[Any]: - py_random: random.Random = random.Random(seed) - if (is_list_like(struct) or is_set_like(struct)) and len(struct) > 0: - return py_random.choice(tuple(struct)) - elif is_dict_like(struct): - k: Any = any_key(struct, seed=seed, raise_error=raise_error) - v: Any = struct[k] - return k, v ## Return an item - elif isinstance(struct, str): - return py_random.choice(struct) - if raise_error: - raise ValueError(f"Unsupported structure: {type(struct)}") - return None - - -def any_key(d: Dict, *, seed: Optional[int] = None, raise_error: bool = True) -> Optional[Any]: - py_random: random.Random = random.Random(seed) - if is_not_empty_dict(d): - return py_random.choice(sorted(list(d.keys()))) - if raise_error: - raise ValueError( - f"Expected input to be a non-empty dict; " - f"found {type(d) if not is_dict_like(d) else 'empty dict'}." 
- ) - return None - - -def any_value(d: Dict, *, seed: Optional[int] = None, raise_error: bool = True) -> Optional[Any]: - k: Any = any_key(d, seed=seed, raise_error=raise_error) - return d[k] - - -def first_item( - struct: Union[List, Tuple, Set, Dict, str], - *, - raise_error: bool = True, -) -> Optional[Any]: - if is_dict_like(struct): - k: Any = first_key(struct, raise_error=raise_error) - v: Any = struct[k] - return k, v ## Return an item - elif is_list_like(struct) or is_set_like(struct) or isinstance(struct, str): - return list(struct)[0] - if raise_error: - raise ValueError(f"Unsupported structure: {type(struct)}") - return None - - -def first_key(d: Dict, *, raise_error: bool = True) -> Optional[Any]: - if is_not_empty_dict(d): - return list(d.keys())[0] - if raise_error: - raise ValueError( - f"Expected input to be a non-empty dict; " - f"found {type(d) if not is_dict_like(d) else 'empty dict'}." - ) - return None - - -def first_value(d: Dict, *, raise_error: bool = True) -> Optional[Any]: - k: Any = first_key(d, raise_error=raise_error) - return d[k] - - -## ======================== Pandas utils ======================== ## -def get_num_non_null_columns_per_row(df: pd.DataFrame) -> pd.Series: - ## Ref: https://datascience.stackexchange.com/a/16801/35826 - assert isinstance(df, pd.DataFrame) - return (~df.isna()).sum(axis=1) - - -def get_max_num_non_null_columns_per_row(df: pd.DataFrame) -> int: - assert isinstance(df, pd.DataFrame) - return get_num_non_null_columns_per_row(df).max() - - -@contextmanager -def pd_display(**kwargs): - """ - Use pd.describe_option('display') to see all options. - """ - try: - from IPython.display import display - except ImportError: - display = print - set_param_from_alias(params=kwargs, param="max_rows", alias=["num_rows", "nrows", "rows"], default=None) - set_param_from_alias(params=kwargs, param="max_cols", alias=["num_cols", "ncols", "cols"], default=None) - set_param_from_alias( - params=kwargs, - param="max_colwidth", - alias=[ - "max_col_width", - "max_columnwidth", - "max_column_width", - "columnwidth", - "column_width", - "colwidth", - "col_width", - ], - default=None, - ) - set_param_from_alias(params=kwargs, param="vertical_align", alias=["valign"], default="top") - set_param_from_alias(params=kwargs, param="text_align", alias=["textalign"], default="left") - set_param_from_alias(params=kwargs, param="ignore_css", alias=["css"], default=False) - - max_rows: Optional[int] = kwargs.get("max_rows") - max_cols: Optional[int] = kwargs.get("max_cols") - max_colwidth: Optional[int] = kwargs.get("max_colwidth") - vertical_align: str = kwargs["vertical_align"] - text_align: str = kwargs["text_align"] - ignore_css: bool = kwargs["ignore_css"] - - # print(kwargs) - - def disp(df: pd.DataFrame): - css = [ - ## Align header to center - { - "selector": "th", - "props": [ - ("vertical-align", "center"), - ("text-align", "center"), - ("padding", "10px"), - ], - }, - ## Align cell to top and left/center - { - "selector": "td", - "props": [ - ("vertical-align", vertical_align), - ("text-align", text_align), - ("padding", "10px"), - ], - }, - ] - if not ignore_css and isinstance(df, pd.DataFrame): - df = df.style.set_table_styles(css) - display(df) - - with pd.option_context( - "display.max_rows", - max_rows, - "display.max_columns", - max_cols, - "max_colwidth", - max_colwidth, - "display.expand_frame_repr", - False, - ): - yield disp - - -def pd_partial_column_order(df: pd.DataFrame, columns: List) -> pd.DataFrame: - columns: List = as_list(columns) - 
df_columns: List = list(df.columns) - final_columns: List = [] - for col in columns: - if col not in df_columns: - raise ValueError(f'Column "{col}" not found in current {pd.DataFrame} columns: {df.columns}') - final_columns.append(col) - for col in df_columns: ## Add all the remaining columns - if col not in final_columns: - final_columns.append(col) - assert set(final_columns) == set(df_columns) - return df[final_columns] - - -## ======================== NumPy utils ======================== ## -def is_numpy_integer_array(data: Any) -> bool: - if not isinstance(data, np.ndarray): - return False - return issubclass(data.dtype.type, np.integer) - - -def is_numpy_float_array(data: Any) -> bool: - if not isinstance(data, np.ndarray): - return False - return issubclass(data.dtype.type, float) - - -def is_numpy_string_array(data: Any) -> bool: - if not isinstance(data, np.ndarray): - return False - return issubclass(data.dtype.type, str) - - -## Ref (from Pytorch tests): -## github.com/pytorch/pytorch/blob/e180ca652f8a38c479a3eff1080efe69cbc11621/torch/testing/_internal/common_utils.py#L349 -NUMPY_TO_TORCH_DTYPE_MAP = {} -with optional_dependency("torch"): - import torch - - NUMPY_TO_TORCH_DTYPE_MAP = { - np.bool_: torch.bool, - np.uint8: torch.uint8, - np.int8: torch.int8, - np.int16: torch.int16, - np.int32: torch.int32, - np.int64: torch.int64, - np.float16: torch.float16, - np.float32: torch.float32, - np.float64: torch.float64, - np.complex64: torch.complex64, - np.complex128: torch.complex128, - } - TORCH_TO_NUMPY_DTYPE_MAP = {v: k for k, v in NUMPY_TO_TORCH_DTYPE_MAP.items()} diff --git a/src/fmcore/util/language/_testing.py b/src/fmcore/util/language/_testing.py deleted file mode 100644 index 0fe5888..0000000 --- a/src/fmcore/util/language/_testing.py +++ /dev/null @@ -1,16 +0,0 @@ -from itertools import product -from typing import * - -from ._import import optional_dependency -from ._structs import flatten2d - -with optional_dependency("parameterized"): - from parameterized import parameterized - - def parameterized_name_func(test, _, param): - ## Ref: https://kracekumar.com/post/618264170735009792/parameterize-python-tests/ - return f"{test.__name__}_{parameterized.to_safe_name('_'.join([str(x) for x in param.args]))}" - - -def parameterized_flatten(*args) -> List: - return flatten2d(list(product(*args))) diff --git a/src/fmcore/util/language/_typing.py b/src/fmcore/util/language/_typing.py deleted file mode 100644 index 86aa9b7..0000000 --- a/src/fmcore/util/language/_typing.py +++ /dev/null @@ -1,620 +0,0 @@ -import functools -import inspect -import json -import typing -from abc import ABC -from typing import * - -import numpy as np -import typing_extensions -from pydantic import ( - BaseModel, - Extra, - Field, - constr, - create_model_from_typeddict, - root_validator, - validate_arguments, -) -from pydantic.fields import Undefined - -from ._autoenum import AutoEnum -from ._function import call_str_to_params, get_fn_spec, is_function, params_to_call_str -from ._string import NeverFailJsonEncoder, String -from ._structs import as_list, as_set, is_list_like -from ._utils import get_default - - -def type_str(data: Any) -> str: - if isinstance(data, type): - if issubclass(data, Parameters): - out: str = data.class_name - else: - out: str = str(data.__name__) - else: - out: str = str(type(data)) - ## Crocodile brackets mess up Aim's logging, they are treated as HTML tags. 
- out: str = out.replace("<", "").replace(">", "") - return out - - -def is_abstract(Class: Type) -> bool: - return ABC in Class.__bases__ - - -## Ref: https://stackoverflow.com/a/13624858/4900327 -class classproperty(property): - def __get__(self, obj, objtype=None): - return super(classproperty, self).__get__(objtype) - - def __set__(self, obj, value): - super(classproperty, self).__set__(type(obj), value) - - def __delete__(self, obj): - super(classproperty, self).__delete__(type(obj)) - - -def safe_validate_arguments(f): - names_to_fix = {n for n in BaseModel.__dict__ if not n.startswith("_")} - - @functools.wraps(f) - def wrapper(*args, **kwargs): - kwargs = {n[:-1] if n[:-1] in names_to_fix else n: v for n, v in kwargs.items()} - return f(*args, **kwargs) - - def _create_param(p: inspect.Parameter) -> inspect.Parameter: - default = Undefined if p.default is inspect.Parameter.empty else p.default - return p.replace(name=f"{p.name}_", default=Field(default, alias=p.name)) - - sig = inspect.signature(f) - sig = sig.replace( - parameters=[_create_param(p) if n in names_to_fix else p for n, p in sig.parameters.items()] - ) - - wrapper.__signature__ = sig - wrapper.__annotations__ = {f"{n}_" if n in names_to_fix else n: v for n, v in f.__annotations__.items()} - - try: - return validate_arguments( - wrapper, - config={ - "allow_population_by_field_name": True, - "arbitrary_types_allowed": True, - }, - ) - except Exception as e: - raise ValueError( - f"Error creating model for function {get_fn_spec(f).resolved_name}." - f"\nEncountered Exception: {String.format_exception_msg(e)}" - ) - - -def check_isinstance( - x: Optional[Any], y: Union[List[Type], Tuple[Type, ...], Type], raise_error: bool = True -): - if x is None and y is type(None): - return True - assert isinstance(y, type) or (isinstance(y, (list, tuple)) and np.all([isinstance(z, type) for z in y])) - if (isinstance(y, type) and isinstance(x, y)) or ( - isinstance(y, list) and np.any([isinstance(x, z) for z in y]) - ): - return True - if raise_error: - y_str: str = ", ".join([type_str(_y) for _y in as_list(y)]) - raise TypeError( - f"Input parameter must be of type `{y_str}`; found type `{type_str(x)}` with value:\n{x}" - ) - return False - - -def check_isinstance_or_none(x: Optional[Any], y: Type, raise_error: bool = True): - if x is None: - return True - return check_isinstance(x, y, raise_error=raise_error) - - -def check_issubclass_or_none(x: Optional[Any], y: Type, raise_error: bool = True): - if x is None: - return True - return check_issubclass(x, y, raise_error=raise_error) - - -def check_issubclass(x: Optional[Any], y: Type, raise_error: bool = True): - if x is None: - return False - assert isinstance(x, type) - assert isinstance(y, type) or (isinstance(y, list) and np.all([isinstance(z, type) for z in y])) - if (isinstance(y, type) and issubclass(x, y)) or ( - isinstance(y, list) and np.any([issubclass(x, z) for z in y]) - ): - return True - if raise_error: - raise TypeError( - f"Input parameter must be a subclass of type {str(y)}; found type {type(x)} with value {x}" - ) - return False - - -def get_classvars(cls) -> List[str]: - return [ - var_name - for var_name, typing_ in typing.get_type_hints(cls).items() - if typing_.__origin__ is typing.ClassVar - ] - - -def get_classvars_typing(cls) -> Dict[str, Any]: - return { - var_name: typing_.__args__[0] - for var_name, typing_ in typing.get_type_hints(cls).items() - if typing.get_origin(typing_) is typing.ClassVar - } - - -class Registry(ABC): - """ - A registry for 
subclasses. When a base class extends Registry, its subclasses will automatically be registered, - without any code in the base class to do so explicitly. - This coding trick allows us to maintain the Dependency Inversion Principle, as the base class does not have to - depend on any subclass implementation; in the base class code, we can instead retrieve the subclass in the registry - using a key, and then interact with the retrieved subclass using the base class interface methods (which we assume - the subclass has implemented as per the Liskov Substitution Principle). - - Illustrative example: - Suppose we have abstract base class AbstractAnimal. - This is registered as a registry via: - class AbstractAnimal(Parameters, Registry, ABC): - pass - Then, subclasses of AbstractAnimal will be automatically registered: - class Dog(AbstractAnimal): - name: str - Now, we can extract the subclass using the registered keys (of which the class-name is always included): - AbstractAnimalSubclass = AbstractAnimal.get_subclass('Dog') - dog = AbstractAnimalSubclass(name='Sparky') - - We can also set additional keys to register the subclass against: - class AnimalType(AutoEnum): - CAT = auto() - DOG = auto() - BIRD = auto() - - class Dog(AbstractAnimal): - aliases = [AnimalType.DOG] - - AbstractAnimalSubclass = AbstractAnimal.get_subclass(AnimalType.DOG) - dog = AbstractAnimalSubclass(name='Sparky') - - Alternately, the registry keys can be set using the _registry_keys() classmethod: - class Dog(AbstractAnimal): - @classmethod - def _registry_keys(cls) -> List[Any]: - return [AnimalType.DOG] - """ - - _registry: ClassVar[Dict[Any, Dict[str, Type]]] = {} ## Dict[key, Dict[classname, Class] - _registry_base_class: ClassVar[Optional[Type[BaseModel]]] = None - _classvars_typing_dict: ClassVar[Optional[Dict[str, Any]]] = None - _classvars_BaseModel: ClassVar[Optional[Type[BaseModel]]] = None - _allow_multiple_subclasses: ClassVar[bool] = False - _allow_subclass_override: ClassVar[bool] = False - _dont_register: ClassVar[bool] = False - aliases: ClassVar[Tuple[str, ...]] = tuple() - - def __init_subclass__(cls, **kwargs): - """ - Register any subclass with the base class. A child class is registered as long as it is imported/defined. - """ - super().__init_subclass__(**kwargs) - if cls in Registry.__subclasses__(): - ## The current class is a direct subclass of Registry (i.e. it is the base class of the hierarchy). - cls._registry: Dict[Any, Dict[str, Type]] = {} - cls._registry_base_class: Type = cls - cls.__set_classvars_typing() - else: - ## The current class is a subclass of a Registry-subclass, and is not abstract; register this. 
- if not is_abstract(cls) and not cls._dont_register: - cls._pre_registration_hook() - cls.__set_classvars_typing() - cls.__validate_classvars_BaseModel() - cls.__register_subclass() - - @classmethod - def __set_classvars_typing(cls): - classvars_typing_dict: Dict[str, Any] = { - var_name: typing_ - for var_name, typing_ in get_classvars_typing(cls).items() - if not var_name.startswith("_") - } - cls._classvars_typing_dict: ClassVar[Dict[str, Any]] = classvars_typing_dict - - class Config(Parameters.Config): - extra = Extra.ignore - - cls._classvars_BaseModel: ClassVar[Type[BaseModel]] = create_model_from_typeddict( - typing_extensions.TypedDict(f"{cls.__name__}_ClassVarsBaseModel", classvars_typing_dict), - warnings=False, - __config__=Config, - ) - - @classmethod - def __validate_classvars_BaseModel(cls): - ## Gives the impression of validating ClassVars on concrete subclasses in the hierarchy. - classvar_values: Dict[str, Any] = {} - for classvar, type_ in cls._classvars_typing_dict.items(): - if not hasattr(cls, classvar): - if ABC not in cls.__bases__: - ## Any concrete class must have all classvars set with values. - raise ValueError( - f'You must set a value for class variable "{classvar}" on subclass "{cls.__name__}".\n' - f'Custom type-hints might be one reason why "{classvar}" is not recognized. ' - f'If you have added custom type-hints, please try removing them and set "{classvar}" like so: `{classvar} = `' - ) - else: - classvar_value = getattr(cls, classvar) - if hasattr(type_, "__origin__"): - if ( - type_.__origin__ == typing.Union - and len(type_.__args__) == 2 - and type(None) in type_.__args__ - ): - ## It is something like Optional[str], Optional[List[str]], etc. - args = set(type_.__args__) - args.remove(type(None)) - classvar_type = next(iter(args)) - else: - classvar_type = type_.__origin__ - if classvar_type in {set, list, tuple} and classvar_value is not None: - classvar_value = classvar_type(as_list(classvar_value)) - classvar_values[classvar] = classvar_value - classvar_values: BaseModel = cls._classvars_BaseModel(**classvar_values) - for classvar, type_ in cls._classvars_typing_dict.items(): - if not hasattr(cls, classvar): - if ABC not in cls.__bases__: - ## Any concrete class must have all classvars set with values. - raise ValueError( - f'You must set a value for class variable "{classvar}" on subclass "{cls.__name__}".\n' - f'Custom type-hints might be one reason why "{classvar}" is not recognized. 
' - f'If you have added custom type-hints, please try removing them and set "{classvar}" like so: `{classvar} = `' - ) - else: - setattr(cls, classvar, classvar_values.__getattribute__(classvar)) - - @classmethod - def _pre_registration_hook(cls): - pass - - @classmethod - def __register_subclass(cls): - subclass_name: str = str(cls.__name__).strip() - cls.__add_to_registry(subclass_name, cls) ## Always register subclass name - for k in set(as_list(cls.aliases) + as_list(cls._registry_keys())): - if k is not None: - cls.__add_to_registry(k, cls) - - @classmethod - @validate_arguments - def __add_to_registry(cls, key: Any, subclass: Type): - subclass_name: str = subclass.__name__ - if isinstance(key, (str, AutoEnum)): - ## Case-insensitive matching: - keys_to_register: List[str] = [String.str_normalize(key)] - elif isinstance(key, tuple): - keys_to_register: List[Tuple] = [ - tuple( - ## Case-insensitive matching: - String.str_normalize(k) if isinstance(k, (str, AutoEnum)) else k - for k in key - ) - ] - else: - keys_to_register: List[Any] = [key] - for k in keys_to_register: - if k not in cls._registry: - cls._registry[k] = {subclass_name: subclass} - continue - ## Key is in the registry - registered: Dict[str, Type] = cls._registry[k] - registered_names: Set[str] = set(registered.keys()) - assert len(registered_names) > 0, f"Invalid state: key {k} is registered to an empty dict" - if subclass_name in registered_names and cls._allow_subclass_override is False: - raise KeyError( - f"A subclass with name {subclass_name} is already registered against key {k} for registry under " - f"{cls._registry_base_class}; overriding subclasses is not permitted." - ) - elif subclass_name not in registered_names and cls._allow_multiple_subclasses is False: - assert len(registered_names) == 1, ( - f"Invalid state: _allow_multiple_subclasses is False but we have multiple subclasses registered " - f"against key {k}" - ) - raise KeyError( - f"Key {k} is already registered to subclass {next(iter(registered_names))}; registering " - f"multiple subclasses to the same key is not permitted." - ) - cls._registry[k] = { - **registered, - ## Add or override the subclass names - subclass_name: subclass, - } - - @classmethod - def get_subclass( - cls, - key: Any, - raise_error: bool = True, - *args, - **kwargs, - ) -> Optional[Union[Type, List[Type]]]: - if isinstance(key, (str, AutoEnum)): - Subclass: Optional[Dict[str, Type]] = cls._registry.get(String.str_normalize(key)) - else: - Subclass: Optional[Dict[str, Type]] = cls._registry.get(key) - if Subclass is None: - if raise_error: - raise KeyError( - f'Could not find subclass of {cls} using key: "{key}" (type={type(key)}). 
' - f"Available keys are: {set(cls._registry.keys())}" - ) - return None - if len(Subclass) == 1: - return next(iter(Subclass.values())) - return list(Subclass.values()) - - @classmethod - def subclasses(cls, keep_abstract: bool = False) -> Set[Type]: - available_subclasses: Set[Type] = set() - for k, d in cls._registry.items(): - for subclass in d.values(): - if subclass == cls._registry_base_class: - continue - if is_abstract(subclass) and keep_abstract is False: - continue - if isinstance(subclass, type) and issubclass(subclass, cls): - available_subclasses.add(subclass) - return available_subclasses - - @classmethod - def remove_subclass(cls, subclass: Union[Type, str]): - name: str = subclass - if isinstance(subclass, type): - name: str = subclass.__name__ - for k, d in cls._registry.items(): - for subclass_name, subclass in list(d.items()): - if String.str_normalize(subclass_name) == String.str_normalize(name): - d.pop(subclass_name, None) - - @classmethod - def _registry_keys(cls) -> Optional[Union[List[Any], Any]]: - return None - - -## Ref: https://stackoverflow.com/q/6760685/4900327, Method 2 base class. -## The metaclass method in the above link did not work well with multiple inheritance. -class Singleton: - __instance = None - - def __new__(cls, *args, **kwargs): - if not isinstance(cls.__instance, cls): - cls.__instance = super(Singleton, cls).__new__(cls) - return cls.__instance - - @classproperty - def instance(cls): - return cls.__instance - - -ParametersSubclass = TypeVar("ParametersSubclass", bound="Parameters") - - -class Parameters(BaseModel, ABC): - ## Ref on Pydantic + ABC: https://pydantic-docs.helpmanual.io/usage/models/#abstract-base-classes - ## Needed to work with Registry.alias...this needs to be on a subclass of `BaseModel`. - aliases: ClassVar[Tuple[str, ...]] = tuple() - dict_exclude: ClassVar[Tuple[str, ...]] = tuple() - - def __init__(self, *args, **kwargs): - try: - super().__init__(*args, **kwargs) - except Exception as e: - raise ValueError( - f'Cannot create Pydantic instance of type "{self.class_name}".' - f"\nEncountered exception: {String.format_exception_msg(e)}" - ) - - @classproperty - def class_name(cls) -> str: - return str(cls.__name__) ## Will return the child class name. 
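Parameters is the immutable Pydantic base class that the other classes removed in this hunk (UserEnteredParameters, MutableParameters, MappedParameters, and Log further below) extend. A minimal usage sketch follows, assuming the pre-migration import path (mirroring the import in the deleted fmcore/util/logging.py below); ExperimentConfig and its fields are hypothetical, not taken from the repository:

from fmcore.util.language import Parameters


class ExperimentConfig(Parameters):
    ## Fields are declared as ordinary Pydantic attributes:
    model_name: str
    lr: float = 1e-3


cfg = ExperimentConfig(model_name="bert-base-uncased")
## Parameters.Config sets allow_mutation=False, so instances are faux-immutable;
## update_params() therefore returns a modified copy rather than mutating in place:
cfg2 = cfg.update_params(lr=3e-4)
assert (cfg.lr, cfg2.lr) == (1e-3, 3e-4)
assert cfg2.class_name == "ExperimentConfig"  ## classproperty defined above
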
- - @classmethod - def param_names(cls, **kwargs) -> Set[str]: - # superclass_params: Set[str] = set(super(Parameters, cls).schema(**kwargs)['properties'].keys()) - class_params: Set[str] = set(cls.schema(**kwargs)["properties"].keys()) - return class_params # .union(superclass_params) - - @classmethod - def param_default_values(cls, **kwargs) -> Dict: - return { - param: param_schema["default"] - for param, param_schema in cls.schema(**kwargs)["properties"].items() - if "default" in param_schema ## The default value might be None - } - - @classmethod - def _clear_extra_params(cls, params: Dict) -> Dict: - return {k: v for k, v in params.items() if k in cls.param_names()} - - def dict(self, *args, exclude: Optional[Any] = None, **kwargs) -> Dict: - exclude: Set[str] = as_set(get_default(exclude, [])).union(as_set(self.dict_exclude)) - return super(Parameters, self).dict(*args, exclude=exclude, **kwargs) - - def json(self, *args, encoder: Optional[Any] = None, indent: Optional[int] = None, **kwargs) -> str: - if encoder is None: - encoder = functools.partial(json.dumps, cls=NeverFailJsonEncoder, indent=indent) - return super(Parameters, self).json(*args, encoder=encoder, **kwargs) - - @classproperty - def _constructor(cls) -> ParametersSubclass: - return cls - - def __str__(self) -> str: - params_str: str = self.json(indent=4) - out: str = f"{self.class_name} with params:\n{params_str}" - return out - - class Config: - ## Ref for Pydantic mutability: https://pydantic-docs.helpmanual.io/usage/models/#faux-immutability - allow_mutation = False - ## Ref for Extra.forbid: https://pydantic-docs.helpmanual.io/usage/model_config/#options - extra = Extra.forbid - ## Ref for Pydantic private attributes: https://pydantic-docs.helpmanual.io/usage/models/#private-model-attributes - underscore_attrs_are_private = True - ## Validates default values. Ref: https://pydantic-docs.helpmanual.io/usage/model_config/#options - validate_all = True - ## Validates typing by `isinstance` check. Ref: https://pydantic-docs.helpmanual.io/usage/model_config/#options - arbitrary_types_allowed = True - - @staticmethod - def _convert_params(Class: Type[BaseModel], d: Union[Type[BaseModel], Dict]): - if type(d) == Class: - return d - if isinstance(d, BaseModel): - return Class(**d.dict(exclude=None)) - if d is None: - return Class() - if isinstance(d, dict): - return Class(**d) - raise NotImplementedError(f"Cannot convert object of type {type(d)} to {Class.__class__}") - - def update_params(self, **new_params) -> Generic[ParametersSubclass]: - ## Since Parameters class is immutable, we create a new one: - overidden_params: Dict = { - **self.dict(exclude=None), - **new_params, - } - return self._constructor(**overidden_params) - - def copy(self, **kwargs) -> Generic[ParametersSubclass]: - return super(Parameters, self).copy(**kwargs) - - def clone(self, **kwargs) -> Generic[ParametersSubclass]: - return self.copy(**kwargs) - - -class UserEnteredParameters(Parameters): - """ - Case-insensitive Parameters class. - Use this for configs classes where you expect to read from user-entered input, which might have any case. - IMPORTANT: the param names in the subclass must be in LOWERCASE ONLY. 
- Ref: https://github.com/samuelcolvin/pydantic/issues/1147#issuecomment-571109376 - """ - - @root_validator(pre=True) - def convert_params_to_lowercase(cls, params: Dict): - return {str(k).strip().lower(): v for k, v in params.items()} - - -class MutableParameters(Parameters): - class Config(Parameters.Config): - ## Ref on mutability: https://pydantic-docs.helpmanual.io/usage/models/#faux-immutability - allow_mutation = True - - -class MutableUserEnteredParameters(UserEnteredParameters, MutableParameters): - pass - - -class MappedParameters(Parameters, ABC): - """ - Allows creation of a Parameters instance by mapping from a dict. - From this dict, the 'name' key will be used to look up the cls._mapping dictionary, and retrieve the corresponding - class. This class will be instantiated using the other values in the dict. - """ - - _mapping: ClassVar[Dict[Union[Tuple[str, ...], str], Any]] - - class Config(Parameters.Config): - extra = Extra.allow - - name: constr(min_length=1) - args: Tuple = () - - def __init_subclass__(cls, **kwargs): - super().__init_subclass__(**kwargs) - if not isinstance(cls._mapping, dict) or len(cls._mapping) == 0: - raise ValueError(f"Lookup must be a non-empty dict; found: {cls._mapping}") - for key, val in list(cls._mapping.items()): - if is_list_like(key): - for k in key: - cls._mapping[String.str_normalize(k)] = val - else: - cls._mapping[String.str_normalize(key)] = val - - @root_validator(pre=True) - def check_mapped_params(cls, params: Dict) -> Dict: - if String.str_normalize(params["name"]) not in cls._mapping: - raise ValueError( - f'''`name`="{params["name"]}" was not found in the lookup. ''' - f"""Valid values for `name`: {set(cls._mapping.keys())}""" - ) - return params - - def dict(self, *args, exclude: Optional[Any] = None, **kwargs) -> Dict: - params: Dict = super(Parameters, self).dict(*args, exclude=exclude, **kwargs) - if exclude is not None and "name" in exclude: - params.pop("name", None) - else: - params["name"] = self.name - return params - - def __str__(self) -> str: - params_str: str = self.json(indent=4) - out: str = f"{self.class_name} with params:\n{params_str}" - return out - - @classmethod - def from_call_str(cls, call_str: str) -> Any: - args, kwargs = call_str_to_params(call_str) - return cls(args=args, **kwargs) - - def mapped_callable(self) -> Any: - return self._mapping[String.str_normalize(self.name)] - - @property - def kwargs(self) -> Dict: - return self.dict(exclude={"name", "args"} | set(self.dict_exclude)) - - def to_call_str(self) -> str: - args: List = list(self.args) - kwargs: Dict = self.kwargs - callable: Callable = self.mapped_callable() - if is_function(callable) or isinstance(callable, type): - callable_name: str = callable.__name__ - else: - callable_name: str = str(callable) - return params_to_call_str( - callable_name=callable_name, - args=args, - kwargs=kwargs, - ) - - @classmethod - @safe_validate_arguments - def of( - cls, - name: Optional[Union[Parameters, Dict, str]], - **params, - ) -> Optional[Any]: - if name is None: - return None - if isinstance(name, cls): - return name - if isinstance(name, dict): - return cls(**name) - if isinstance(name, str): - if "(" in name or ")" in name: - return cls.from_call_str(name) - else: - return cls(**{"name": name, **params}) - raise ValueError(f"Unsupported value for `name`: {name}") - - def initialize(self, **kwargs) -> Any: - return self.mapped_callable()(*self.args, **self.kwargs, **kwargs) diff --git a/src/fmcore/util/language/_utils.py 
b/src/fmcore/util/language/_utils.py deleted file mode 100644 index 3759987..0000000 --- a/src/fmcore/util/language/_utils.py +++ /dev/null @@ -1,122 +0,0 @@ -from typing import * - -import numpy as np -import pandas as pd -from pandas.api.types import is_scalar as pd_is_scalar - - -def get_default(*vals) -> Optional[Any]: - for x in vals: - if not is_null(x): - return x - return None - - -def unset(obj, attr_name: str, new_val: Any = None, delete: bool = True): - attr: Any = getattr(obj, attr_name) - setattr(obj, attr_name, new_val) - if delete: - del attr - - -def get_true(*vals) -> bool: - for x in vals: - if x is True: - return x - return False - - -if_else = lambda cond, x, y: (x if cond is True else y) ## Ternary operator -is_series = lambda x: isinstance(x, pd.Series) -is_df = lambda x: isinstance(x, pd.DataFrame) - - -## ======================== None utils ======================== ## -def any_are_none(*args) -> bool: - for x in args: - if x is None: - return True - return False - - -def all_are_not_none(*args) -> bool: - return not any_are_none(*args) - - -def all_are_none(*args) -> bool: - for x in args: - if x is not None: - return False - return True - - -def any_are_not_none(*args) -> bool: - return not all_are_none(*args) - - -def all_are_true(*args) -> bool: - for x in args: - assert x in {True, False} - if not x: ## Check for falsy values - return False - return True - - -def all_are_false(*args) -> bool: - for x in args: - assert x in {True, False} - if x: ## Check for truthy values - return False - return True - - -def none_count(*args) -> int: - none_count: int = 0 - for x in args: - if x is None: - none_count += 1 - return none_count - - -def not_none_count(*args) -> int: - return len(args) - none_count(*args) - - -def multiple_are_none(*args) -> bool: - return none_count(*args) >= 2 - - -def multiple_are_not_none(*args) -> bool: - return not_none_count(*args) >= 2 - - -def equal(*args) -> bool: - if len(args) == 0: - raise ValueError("Cannot find equality for zero arguments") - if len(args) == 1: - return True - first_arg = args[0] - for arg in args[1:]: - if arg != first_arg: - return False - return True - - -def is_scalar(x: Any, method: Literal["numpy", "pandas"] = "pandas") -> bool: - if method == "pandas": - ## Ref: https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.api.types.is_scalar.html - ## Actual code: github.com/pandas-dev/pandas/blob/0402367c8342564538999a559e057e6af074e5e4/pandas/_libs/lib.pyx#L162 - return pd_is_scalar(x) - if method == "numpy": - ## Ref: https://numpy.org/doc/stable/reference/arrays.scalars.html#built-in-scalar-types - return np.isscalar(x) - raise NotImplementedError(f'Unsupported method: "{method}"') - - -is_null = lambda z: pd.isnull(z) if is_scalar(z) else (z is None) -is_not_null = lambda z: not is_null(z) - - -class Utility: - def __init__(self): - raise TypeError(f'Cannot instantiate utility class "{str(self.__class__)}"') diff --git a/src/fmcore/util/logging.py b/src/fmcore/util/logging.py deleted file mode 100644 index 678ad61..0000000 --- a/src/fmcore/util/logging.py +++ /dev/null @@ -1,241 +0,0 @@ -import logging -import os -import sys -import warnings -from contextlib import contextmanager -from typing import * - -import pandas as pd -from pydantic import FilePath, conint, constr -from pydantic.typing import Literal - -from fmcore.util.jupyter import JupyterNotebook -from fmcore.util.language import MutableParameters, String, binary_search, safe_validate_arguments - - -class Log(MutableParameters): - 
LOG_LEVEL_SUFFIX: ClassVar[str] = "" - DEBUG: ClassVar[str] = "DEBUG" - INFO: ClassVar[str] = "INFO" - WARNING: ClassVar[str] = "WARNING" - ERROR: ClassVar[str] = "ERROR" - FATAL: ClassVar[str] = "FATAL" - - LOG_LEVELS: ClassVar[Dict[str, int]] = { - f"{DEBUG}{LOG_LEVEL_SUFFIX}": logging.DEBUG, - f"{INFO}{LOG_LEVEL_SUFFIX}": logging.INFO, - f"{WARNING}{LOG_LEVEL_SUFFIX}": logging.WARNING, - f"{ERROR}{LOG_LEVEL_SUFFIX}": logging.ERROR, - f"{FATAL}{LOG_LEVEL_SUFFIX}": logging.FATAL, - } - LOG_LEVELS_REVERSE: ClassVar[Dict[int, str]] = { - logging.DEBUG: f"{DEBUG}{LOG_LEVEL_SUFFIX}", - logging.INFO: f"{INFO}{LOG_LEVEL_SUFFIX}", - logging.WARNING: f"{WARNING}{LOG_LEVEL_SUFFIX}", - logging.ERROR: f"{ERROR}{LOG_LEVEL_SUFFIX}", - logging.FATAL: f"{FATAL}{LOG_LEVEL_SUFFIX}", - } - ## Add new level names for our purposes to avoid getting logs from other libraries. - for custom_log_level_name, custom_log_level in LOG_LEVELS.items(): - logging.addLevelName(level=custom_log_level, levelName=custom_log_level_name) - - LOG_LEVEL: Literal[ - f"{DEBUG}{LOG_LEVEL_SUFFIX}", - f"{INFO}{LOG_LEVEL_SUFFIX}", - f"{WARNING}{LOG_LEVEL_SUFFIX}", - f"{ERROR}{LOG_LEVEL_SUFFIX}", - f"{FATAL}{LOG_LEVEL_SUFFIX}", - ] = f"{INFO}{LOG_LEVEL_SUFFIX}" - FILE_LOG_LEVEL: Literal[ - f"{DEBUG}{LOG_LEVEL_SUFFIX}", - f"{INFO}{LOG_LEVEL_SUFFIX}", - f"{WARNING}{LOG_LEVEL_SUFFIX}", - f"{ERROR}{LOG_LEVEL_SUFFIX}", - f"{FATAL}{LOG_LEVEL_SUFFIX}", - ] = f"{DEBUG}{LOG_LEVEL_SUFFIX}" - LOG_FILE_PATH: FilePath = None - LOG_FILE_LOGGER: Optional[logging.Logger] = None - IS_JUPYTER_NOTEBOOK: bool = JupyterNotebook.is_notebook() - - class Config(MutableParameters.Config): - arbitrary_types_allowed = True - - @safe_validate_arguments - def set_log_file( - self, - file_path: FilePath, - actor_name: Optional[constr(min_length=1, max_length=64)] = None, - ): - if self.LOG_FILE_LOGGER is not None: - raise RuntimeError( - f'Cannot set log file multiple times; already logging to "{self.LOG_FILE_PATH}"' - ) - if actor_name is not None: - formatter = logging.Formatter( - f"[{actor_name} @ %(asctime)s] %(message)s", datefmt="%Y-%m-%d %H:%M:%S UTC%z" - ) - else: - formatter = logging.Formatter("[%(asctime)s] %(message)s", datefmt="%Y-%m-%d %H:%M:%S UTC%z") - root_logger: logging.Logger = logging.getLogger() ## Gets root logger - root_logger.handlers[:] = [] ## Removes all existing handlers - file_handler: logging.Handler = logging.FileHandler(file_path, mode="a+") - file_handler.setFormatter(formatter) - root_logger.addHandler(file_handler) - root_logger.setLevel(self.LOG_LEVELS[f"{self.DEBUG}{self.LOG_LEVEL_SUFFIX}"]) - self.LOG_FILE_LOGGER = root_logger - self.LOG_FILE_PATH = file_path - - @safe_validate_arguments - def set_log_level(self, log_level: Literal[DEBUG, INFO, WARNING, ERROR, FATAL]): - log_level: str = String.assert_not_empty_and_strip(log_level).upper() + self.LOG_LEVEL_SUFFIX - self.LOG_LEVEL = log_level - - @safe_validate_arguments - def set_file_log_level(self, log_level: Literal[DEBUG, INFO, WARNING, ERROR, FATAL]): - log_level: str = String.assert_not_empty_and_strip(log_level).upper() + self.LOG_LEVEL_SUFFIX - self.FILE_LOG_LEVEL = log_level - - def log(self, *data, level: Union[str, int, float], flush: bool = False, **kwargs): - if isinstance(level, (int, float)): - ## Translate to our log level: - level: str = self.LOG_LEVELS_REVERSE[ - binary_search( - list(self.LOG_LEVELS_REVERSE.keys()), - target=level, - return_tuple=True, - )[0] - ] ## E.g. level=23 returns (DEBUG=20, WARN=30), we should pick DEBUG (lower of the two). 
- data_str: str = " ".join([self.to_log_str(x) for x in data]) - ## print at the appropriate level: - if self.LOG_LEVELS[self.LOG_LEVEL] <= self.LOG_LEVELS[level]: - ## Logs to both stdout and file logger if setup: - if self.IS_JUPYTER_NOTEBOOK: - from IPython.display import display - - for x in data: - if isinstance(x, (pd.DataFrame, pd.Series)): - display(x) - else: - print(self.to_log_str(x), end="", flush=flush) - print("", flush=flush) - else: - print(data_str, flush=flush) - - if ( - self.LOG_FILE_LOGGER is not None - and self.LOG_LEVELS[self.FILE_LOG_LEVEL] <= self.LOG_LEVELS[level] - ): - self.LOG_FILE_LOGGER.log( - ## We log to file at debug level: - level=self.LOG_LEVELS[f"{self.DEBUG}{self.LOG_LEVEL_SUFFIX}"], - msg=data_str, - ) - - def debug(self, *data, **kwargs): - self.log(*data, level=f"{self.DEBUG}{self.LOG_LEVEL_SUFFIX}", **kwargs) - - def info(self, *data, **kwargs): - self.log(*data, level=f"{self.INFO}{self.LOG_LEVEL_SUFFIX}", **kwargs) - - def warning(self, *data, **kwargs): - self.log(*data, level=f"{self.WARNING}{self.LOG_LEVEL_SUFFIX}", **kwargs) - - def error(self, *data, **kwargs): - self.log(*data, level=f"{self.ERROR}{self.LOG_LEVEL_SUFFIX}", **kwargs) - - def fatal(self, *data, **kwargs): - self.log(*data, level=f"{self.FATAL}{self.LOG_LEVEL_SUFFIX}", **kwargs) - - @classmethod - def to_log_str(cls, data: Any, *, df_num_rows: conint(ge=1) = 10) -> str: - if isinstance(data, str): - return data - if isinstance(data, dict): - return "\n" + String.jsonify(data) - if isinstance(data, (list, set, frozenset, tuple)): - return "\n" + String.pretty(data, max_width=int(1e6)) - if isinstance(data, (pd.Series, pd.DataFrame)): - if len(data) <= df_num_rows: - return "\n" + str(data.to_markdown()) - return ( - "\n" - + str(data.head(df_num_rows // 2).to_markdown()) - + f"\n...({len(data) - df_num_rows} more rows)...\n" - + str(data.tail(df_num_rows // 2).to_markdown()) - ) - return String.pretty(data, max_width=int(1e6)) - - -Log: Log = Log() ## Creates a singleton - - -@contextmanager -def ignore_warnings(): - pd_chained_assignment: Optional[str] = pd.options.mode.chained_assignment # default='warn' - with warnings.catch_warnings(): ## Ref: https://stackoverflow.com/a/14463362 - warnings.simplefilter("ignore") - ## Stops Pandas SettingWithCopyWarning in output. 
Ref: https://stackoverflow.com/a/20627316 - pd.options.mode.chained_assignment = None - yield - pd.options.mode.chained_assignment = pd_chained_assignment - - -@contextmanager -def ignore_stdout(): - devnull = open(os.devnull, "w") - stdout = sys.stdout - sys.stdout = devnull - try: - yield - finally: - sys.stdout = stdout - - -@contextmanager -def ignore_stderr(): - devnull = open(os.devnull, "w") - stderr = sys.stderr - sys.stderr = devnull - try: - yield - finally: - sys.stderr = stderr - - -@contextmanager -def ignore_stdout_and_stderr(): - with ignore_stdout(): - with ignore_stderr(): - yield - - -@contextmanager -def ignore_warnings_and_stdout(): - with ignore_warnings(): - with ignore_stdout(): - with ignore_stderr(): - yield - - -@contextmanager -def ignore_logging(disable_upto: int = logging.CRITICAL): - prev_disable_level: int = logging.root.manager.disable - logging.disable(disable_upto + 1) - try: - yield - finally: - logging.disable(prev_disable_level) - - -@contextmanager -def ignore_all_output(): - with ignore_stdout(): - with ignore_warnings(): - with ignore_stderr(): - with ignore_logging(): - yield - - -@contextmanager -def ignore_nothing(): - yield diff --git a/src/fmcore/util/notify.py b/src/fmcore/util/notify.py deleted file mode 100644 index 5db0762..0000000 --- a/src/fmcore/util/notify.py +++ /dev/null @@ -1,170 +0,0 @@ -from abc import ABC, abstractmethod -from datetime import datetime -from typing import * - -import requests -from pydantic import BaseModel, constr, root_validator - -from fmcore.constants.DataProcessingConstants import Status -from fmcore.util.language import Parameters, Registry, String, get_default, safe_validate_arguments - -Notifier = "Notifier" -NotifierSubclass = TypeVar("NotifierSubclass", bound="Notifier") - - -class Notifier(Parameters, Registry, ABC): - name: constr(min_length=1) - - @root_validator(pre=True) - def convert_params(cls, params: Dict): - params["name"] = cls.class_name - return params - - @abstractmethod - def send(self, msg: Union[constr(min_length=1), int, float, BaseModel], **kwargs) -> bool: - pass - - @classmethod - def of( - cls, - notifier: Optional[Union[Notifier, Dict, str]] = None, - **kwargs, - ) -> NotifierSubclass: - if isinstance(notifier, Notifier): - return notifier - if notifier is None and "name" in kwargs: - notifier = kwargs.pop("name") - if isinstance(notifier, dict): - return cls.of(**notifier) - if isinstance(notifier, str): - if notifier is not None: - NotifierClass: Type[Notifier] = Notifier.get_subclass(notifier) - else: - NotifierClass: Type[Notifier] = cls - if NotifierClass == Notifier: - raise ValueError( - f'"{Notifier.class_name}" is an abstract class. ' - f"To create an instance, please either pass `notifier`, " - f'or call .of(...) on a subclass of "{Notifier.class_name}".' 
- ) - try: - return NotifierClass(**kwargs) - except Exception as e: - raise ValueError( - f"Cannot create notifier with kwargs:\n{kwargs}\nError: {String.format_exception_msg(e)}" - ) - raise NotImplementedError( - f"Unsupported value for `notifier`; found {type(notifier)} with following value:\n{notifier}" - ) - - def pending(self, msg: Optional[str] = None, **kwargs) -> bool: - msg: str = self._create_msg(status=Status.PENDING, msg=msg, **kwargs) - return self.send(msg, **kwargs) - - def running(self, msg: Optional[str] = None, **kwargs) -> bool: - msg: str = self._create_msg(status=Status.RUNNING, msg=msg, **kwargs) - return self.send(msg, **kwargs) - - def success(self, msg: Optional[str] = None, **kwargs) -> bool: - msg: str = self._create_msg(status=Status.SUCCEEDED, msg=msg, **kwargs) - return self.send(msg, **kwargs) - - def failed(self, msg: Optional[str] = None, **kwargs) -> bool: - msg: str = self._create_msg(status=Status.FAILED, msg=msg, **kwargs) - return self.send(msg, **kwargs) - - def stopped(self, msg: Optional[str] = None, **kwargs) -> bool: - msg: str = self._create_msg(status=Status.STOPPED, msg=msg, **kwargs) - return self.send(msg, **kwargs) - - @classmethod - def _create_msg( - cls, - status: Status, - *, - msg: Optional[str] = None, - start_dt: Optional[datetime] = None, - now: Optional[datetime] = None, - raise_error: bool = False, - **kwargs, - ) -> str: - if status is Status.SUCCEEDED: - out: str = "Succeeded" - elif status is Status.FAILED: - out: str = "Failed" - elif status is Status.STOPPED: - out: str = "Stopped" - elif status is Status.RUNNING: - out: str = "Running" - elif status is Status.PENDING: - out: str = "Pending" - else: - raise NotImplementedError(f"Unsupported status: {status}") - now: datetime = get_default(now, datetime.now()) - now: datetime = now.replace(tzinfo=now.astimezone().tzinfo) - out += f" at {String.readable_datetime(now, human=True)}" - if msg is not None: - out = f"[{out}] {msg}" - if start_dt is not None: - start_dt: datetime = start_dt.replace(tzinfo=start_dt.astimezone().tzinfo) - try: - out += f" ({String.readable_seconds((now - start_dt))} elapsed)" - except Exception as e: - if raise_error: - raise e - pass - out: str = out.strip() + "." - return out - - -class NoopNotifier(Notifier): - aliases = ["noop"] - - @safe_validate_arguments - def send(self, msg: Union[constr(min_length=1), int, float, BaseModel], **kwargs) -> bool: - return True ## Do nothing - - -class ChimeNotifier(Notifier): - aliases = ["chime"] - - webhook: constr(min_length=10, max_length=1024, regex="^.*hooks.chime.aws.*$", strip_whitespace=True) - - @safe_validate_arguments - def send( - self, - msg: Union[constr(min_length=1), int, float, BaseModel], - priority: bool = True, - markdown: bool = True, - **kwargs, - ) -> bool: - if isinstance(msg, BaseModel): - msg: str = f"```\n{msg.json(indent=4)}\n```" - markdown: bool = True - msg: str = str(msg) - if priority: - msg = ( - "@All " + msg - ) ## Will notify those with "Normal" and "Full" notification settings for a room. 
- if markdown: - msg = "/md\n\n" + msg - response = requests.post(url=self.webhook, json={"Content": msg}) - return str(response.status_code).startswith("2") - - -class DiscordNotifier(Notifier): - aliases = ["discord"] - - webhook: constr( - min_length=10, max_length=1024, regex="^.*discord.com/api/webhooks/.*$", strip_whitespace=True - ) - - @safe_validate_arguments - def send(self, msg: Union[constr(min_length=1), int, float, BaseModel], **kwargs) -> bool: - if isinstance(msg, BaseModel): - msg: str = f"```\n{msg.json(indent=4)}\n```" - msg: str = str(msg) - response = requests.post( - self.webhook, json={"content": msg}, headers={"Content-Type": "application/json"} - ) - return str(response.status_code).startswith("2") diff --git a/src/fmcore/util/profiling.py b/src/fmcore/util/profiling.py deleted file mode 100644 index d4b130a..0000000 --- a/src/fmcore/util/profiling.py +++ /dev/null @@ -1,252 +0,0 @@ -import math -import time -from datetime import datetime, timedelta -from typing import * - -from pydantic import confloat, root_validator - -from fmcore.util.language import Alias, MutableParameters, Parameters, String, set_param_from_alias -from fmcore.util.logging import Log - - -def measure_time_ms(fn: Callable) -> Tuple[Any, float]: - start: float = time.perf_counter() - output: Any = fn() - end: float = time.perf_counter() - return output, 1000 * (end - start) - - -class TimerError(Exception): - """A custom exception used to report errors in use of Timer class""" - - pass - - -class Timer(Parameters): - task: str - logger: Optional[Callable] = Log.info - silent: bool = (False,) - single_line: bool = False ## Single-line printing - i: Optional[int] = None - max_i: Optional[int] = None - _start_dt: Optional[datetime] = None - _start_time_ns: Optional[int] = None - _end_dt: Optional[datetime] = None - _end_time_ns: Optional[int] = None - - def __init__(self, task: str = "", **kwargs): - super(Timer, self).__init__(task=task, **kwargs) - - @root_validator(pre=True) - def _set_timer_params(cls, params: Dict) -> Dict: - set_param_from_alias(params, param="logger", alias=["log"]) - Alias.set_silent(params, default=False) - if "logger" in params and params["logger"] in {None, False}: - params["logger"]: Optional[Callable] = None - return params - - @property - def has_started(self) -> bool: - return self._start_time_ns is not None - - @property - def has_stopped(self) -> bool: - return self._end_time_ns is not None - - def time_taken(self, format: str) -> Union[timedelta, int, float, str]: - if format in {str, "str", "string"}: - return self.time_taken_str - elif format in {"s", "sec", "seconds"}: - return self.time_taken_sec - elif format in {"ms", "milli", "millis", "millisec", "milliseconds"}: - return self.time_taken_ms - elif format in {"us", "micro", "micros", "microsec", "microseconds"}: - return self.time_taken_us - elif format in {"ns", "nano", "nanos", "nanosec", "nanoseconds"}: - return self.time_taken_ns - elif format in {"dt", "td", "datetime", "timedelta"}: - return self.time_taken_td - raise NotImplementedError(f"Unsupported `format` with type {type(format)} and value: {format}") - - @property - def start_datetime(self) -> datetime: - return self._start_dt - - @property - def end_datetime(self) -> datetime: - return self._end_dt - - @property - def start_time_str(self) -> str: - return String.readable_datetime(self._start_dt) - - @property - def end_time_str(self) -> str: - return String.readable_datetime(self._end_dt) - - @property - def time_taken_str(self) -> str: - 
return String.readable_seconds(self.time_taken_sec, decimals=2) - - @property - def time_taken_human(self) -> str: - return String.readable_seconds(self.time_taken_sec, decimals=2) - - @property - def time_taken_sec(self) -> float: - return self.time_taken_ns / 1e9 - - @property - def time_taken_ms(self) -> float: - return self.time_taken_ns / 1e6 - - @property - def time_taken_us(self) -> float: - return self.time_taken_ns / 1e3 - - @property - def time_taken_ns(self) -> int: - self._check_started() - if self.has_stopped: - return self._end_time_ns - self._start_time_ns - return time.perf_counter_ns() - self._start_time_ns - - @property - def time_taken_td(self) -> timedelta: - ## Python timedelta does not have nanosecond resolution: https://github.com/python/cpython/issues/59648 - return timedelta(microseconds=self.time_taken_us) - - def _check_started(self): - if not self.has_started: - raise TimerError("Timer has not been started. Use .start() to start it.") - - def _check_not_started(self): - if self.has_started: - raise TimerError(f"Timer has already been started at {String.readable_datetime(self._start_dt)}") - - def _check_stopped(self): - if not self.has_stopped: - raise TimerError("Timer has not been stopped. Use .stop() to stop it.") - - def _check_not_stopped(self): - if self.has_stopped: - raise TimerError(f"Timer has already been stopped at {String.readable_datetime(self._end_dt)}") - - def start(self): - self._check_not_started() - self._start_time_ns = time.perf_counter_ns() - now: datetime = datetime.now() - now: datetime = now.replace(tzinfo=now.astimezone().tzinfo) - self._start_dt = now - if self.should_log and not self.single_line: - self.logger(self._start_msg()) - - def alert(self, text: Optional[str] = None): - self._check_started() - self._check_not_stopped() - if self.should_log: - self.logger(self._alert_msg(text)) - - def stop(self): - self._check_not_stopped() - self._end_time_ns: int = time.perf_counter_ns() - now: datetime = datetime.now() - now: datetime = now.replace(tzinfo=now.astimezone().tzinfo) - self._end_dt = now - if self.should_log: - self.logger(self._end_msg()) - - @property - def should_log(self) -> bool: - return self.logger is not None and self.silent is False - - def __enter__(self): - """Start a new timer as a context manager""" - self.start() - return self - - def __exit__(self, *exc_info): - """Stop the context manager timer, report elapsed time.""" - self.stop() - - def _start_msg(self) -> str: - out: str = "" - out += self._task_msg() - out += self._idx_msg() - out += f"Started at {String.readable_datetime(self._start_dt)}..." - return out - - def _alert_msg(self, text: Optional[str] = None) -> str: - out: str = "" - out += self._task_msg() - out += self._idx_msg() - out += f"Timer has been running for {String.readable_seconds(self.time_taken_sec, decimals=2)}." - if isinstance(text, str): - out += f" {text}" - return out - - def _end_msg(self) -> str: - out: str = "" - out += self._task_msg() - out += self._idx_msg() - if self.single_line: - out += ( - f"Started at {String.readable_datetime(self._start_dt)}, " - f"completed in {String.readable_seconds(self.time_taken_sec, decimals=2)}." - ) - return out - out += f"...completed in {String.readable_seconds(self.time_taken_sec, decimals=2)}." 
- return out - - def _task_msg(self) -> str: - out: str = "" - if len(self.task) > 0: - out += f"({self.task}) " - return out - - def _idx_msg(self) -> str: - out: str = "" - if self.i is not None and self.max_i is not None: - out += ( - f"[{String.pad_zeros(i=self.i + 1, max_i=self.max_i)}/" - f"{String.pad_zeros(i=self.max_i, max_i=self.max_i)}] " - ) - elif self.i is not None: - out += f"[{self.i}] " - return out - - -class Timeout(MutableParameters): - timeout: confloat(gt=0) ## In seconds. - last_used_time: float = time.time() - - @property - def has_expired(self) -> bool: - return self.last_used_time + self.timeout < time.time() - - def reset_timeout(self): - self.last_used_time: float = time.time() - - -class Timeout1Min(Timeout): - timeout: confloat(gt=0, le=60) - - -class Timeout15Min(Timeout): - timeout: confloat(gt=0, le=60 * 15) - - -class Timeout1Hr(Timeout): - timeout: confloat(gt=0, le=60 * 60) - - -class Timeout24Hr(Timeout): - timeout: confloat(gt=0, le=60 * 60 * 24) - - -class TimeoutNever(Timeout): - timeout: float = math.inf - - -class Timeout1Week(Timeout): - timeout: confloat(gt=0, le=60 * 60 * 24 * 7) diff --git a/src/fmcore/util/schema.py b/src/fmcore/util/schema.py deleted file mode 100644 index 3006e6c..0000000 --- a/src/fmcore/util/schema.py +++ /dev/null @@ -1,753 +0,0 @@ -import re -from typing import * - -import numpy as np -from pydantic import conint, constr, root_validator - -from fmcore.constants.MLConstants import ( - DATA_ML_TYPES, - GROUND_TRUTH_ML_TYPES, - PREDICTED_ML_TYPES, - MLType, - MLTypeSchema, -) - -## These must be imported separately from the file since we are in the same dir. -from fmcore.util.language import ( - Parameters, - String, - as_list, - as_set, - assert_not_empty_dict, - get_default, - is_empty_list_like, - is_list_like, - keep_keys, - remove_keys, - safe_validate_arguments, -) - -ColTemplate = "ColTemplate" -SchemaTemplate = "Schema" -Schema = "Schema" - -INDEX_COL_TEMPLATE_KEY: str = "index_col" -INDEX_COL_NAME_TEMPLATE: str = "{" + INDEX_COL_TEMPLATE_KEY + "}" -INDEX_COL_DEFAULT_NAME: str = "id" - - -class ColTemplate(Parameters): - template: constr(min_length=1) - args: Tuple[constr(min_length=1), ...] 
- regex: re.Pattern - - def __hash__(self): - return hash(str(self.template)) - - def __str__(self): - return str(self.template) - - @classmethod - def of(cls, template: str, regex_fill: str = ".+?", regex_flags: int = re.IGNORECASE) -> ColTemplate: - return ColTemplate( - template=template, - args=tuple(String.str_format_args(template)), - regex=re.compile(cls.as_regex(template, fill=regex_fill), flags=regex_flags), - ) - - @classmethod - def template_is_unfilled(cls, template: str) -> bool: - return ( - template.find("{") != -1 and template.find("}") != -1 and template.find("{") < template.find("}") - ) - - @classmethod - def as_regex(cls, template: str, fill: str = ".+?") -> str: - return template.format(**{arg: fill for arg in String.str_format_args(template)}) - - def populate( - self, - *, - allow_unfilled: bool = False, - **kwargs, - ) -> Optional[Union[List[str], str]]: - kwargs: Dict[str, Any] = keep_keys(kwargs, self.args) - iterable_args: Set = set() - non_iterable_args: Set = set() - for arg, val in kwargs.items(): - if isinstance(val, (range, list, set, tuple, np.ndarray)): - iterable_args.add(arg) - else: - non_iterable_args.add(arg) - if len(iterable_args) == 0: - col: str = self.template.format(**kwargs) - if self.template_is_unfilled(col): - if not allow_unfilled: - raise ValueError( - f"Column is templatized even after populating arguments. " - f'Column template: "{self.template}"; ' - f'column after populating: "{col}"; ' - f"detected args: {String.str_format_args(col)}; " - f"kwargs: {kwargs}" - ) - return None - return col - else: - if len(non_iterable_args) > 0: - partial_template: str = self.template.format( - **{arg: val for arg, val in kwargs if arg in non_iterable_args} - ) - else: - partial_template: str = self.template - cols: List[str] = [partial_template] - for arg in iterable_args: - vals: Tuple = tuple(kwargs[arg]) - cols_temp = [] - for col in cols: - for val in vals: - cols_temp.append(col.format(**{arg: val})) - cols: List[str] = cols_temp - filtered_cols: List[str] = [] - for col in cols: - if self.template_is_unfilled(col): - if not allow_unfilled: - raise ValueError( - f"Column is templatized even after populating arguments. 
" - f'Column template: "{self.template}"; ' - f'column after populating: "{col}"; ' - f"detected args: {String.str_format_args(col)}; " - f"kwargs: {kwargs}" - ) - else: - filtered_cols.append(col) - cols = filtered_cols - return cols - - def matches(self, cols: Union[List, Tuple, Set, Any]) -> Set[str]: - cols: Set[str] = as_set(cols) - return set(col for col in cols if self.regex.match(str(col)) is not None) - - -class SchemaTemplate(Parameters): - index_col_template: ColTemplate - predictions_schema_template: Dict[ColTemplate, MLType] - ground_truths_schema_template: Dict[ColTemplate, MLType] - features_schema_template: Dict[ColTemplate, MLType] - - @property - def has_features(self) -> bool: - return self.features_schema_template != {} - - @property - def has_ground_truths(self) -> bool: - return self.ground_truths_schema_template != {} - - @property - def has_predictions(self) -> bool: - return self.predictions_schema_template != {} - - @classmethod - def from_parts( - cls, - index_col_template: Optional[str] = None, - ground_truths_schema_template: Optional[MLTypeSchema] = None, - predictions_schema_template: Optional[MLTypeSchema] = None, - features_schema_template: Optional[MLTypeSchema] = None, - ) -> Optional[SchemaTemplate]: - def _to_schema_template(schema: MLTypeSchema) -> Dict[ColTemplate, MLType]: - schema_template_part: Dict[ColTemplate, MLType] = {} - for col, mltype in schema.items(): - if mltype in set.union(GROUND_TRUTH_ML_TYPES, PREDICTED_ML_TYPES).union({MLType.INDEX}): - raise ValueError( - f"Schema template should have MLTypes like {DATA_ML_TYPES}, not {mltype}" - ) - schema_template_part[ColTemplate.of(col)] = mltype - return schema_template_part - - ## Set index column: - if index_col_template is None: - index_col_template: str = INDEX_COL_NAME_TEMPLATE - index_col_template: ColTemplate = ColTemplate.of(index_col_template) - ## Set ground truths: - ground_truths_schema_template: Dict[ColTemplate, MLType] = _to_schema_template( - MLType.convert_values(get_default(ground_truths_schema_template, {})) - ) - ## Set predictions: - predictions_schema_template: Dict[ColTemplate, MLType] = _to_schema_template( - MLType.convert_values(get_default(predictions_schema_template, {})) - ) - ## Set features: - features_schema_template: Dict[ColTemplate, MLType] = _to_schema_template( - MLType.convert_values(get_default(features_schema_template, {})) - ) - return cls( - index_col_template=index_col_template, - predictions_schema_template=predictions_schema_template, - ground_truths_schema_template=ground_truths_schema_template, - features_schema_template=features_schema_template, - ) - - @safe_validate_arguments - def infer_from_mltype_schema( - self, - schema: Union[Dict, Any], - *, - index_col: Optional[constr(min_length=1)] = None, - infer_features: bool = True, - infer_ground_truths: bool = True, - infer_predictions: bool = True, - has_features: bool = False, - has_ground_truths: bool = False, - has_predictions: bool = False, - ) -> Schema: - if isinstance(schema, Schema): - raise ValueError(f"Please call {Schema.class_name}.of(...)") - if not isinstance(schema, dict): - raise ValueError( - f"Expected schema to be a dict of MLTypes; " - f"found schema of type {type(schema)} with value: {schema}" - ) - - ## Might have MLType.GROUND_TRUTH, MLType.PREDICTED: - mltype_schema: MLTypeSchema = MLType.convert_values(schema, raise_error=True) - - ## Set index column: - if index_col is None: - ## index_col must either be passed explicitly, or be present in the schema. 
- index_col: Optional[str] = Schema.filter_index(schema, allow_missing=True) - if index_col is None: - raise ValueError( - f"Passed schema must have exactly one index column, but None found. Schema:\n{schema}" - ) - if len(self.index_col_template.matches({index_col})) == 0: - raise ValueError( - f'Passed schema has index column "{index_col}", ' - f"which does not match index_col_template: {self.index_col_template}" - ) - - if infer_ground_truths is False: - schema: MLTypeSchema = remove_keys(schema, GROUND_TRUTH_ML_TYPES) - - if infer_predictions is False: - schema: MLTypeSchema = remove_keys(schema, PREDICTED_ML_TYPES) - - ## Will have MLType.CATEGORICAL, MLType.FLOAT, etc. instead of MLType.GROUND_TRUTH, MLType.PREDICTED: - inferred_schema_from_cols: Schema = self.infer_from_columns( - set(mltype_schema.keys()), - index_col=index_col, - infer_features=infer_features, - infer_ground_truths=infer_ground_truths, - infer_predictions=infer_predictions, - has_features=has_features, - has_predictions=has_predictions, - has_ground_truths=has_ground_truths, - ) - inferred_col_mltypes: MLTypeSchema = inferred_schema_from_cols.flatten() - - ## Set ground-truths: - ground_truths_schema: MLTypeSchema = Schema.filter_schema( - data_schema=schema, - mltypes=GROUND_TRUTH_ML_TYPES, - ) - if len(ground_truths_schema) == 0: - ground_truths_schema: MLTypeSchema = inferred_schema_from_cols.ground_truths_schema - if has_ground_truths and len(ground_truths_schema) == 0: - raise ValueError( - f"Expected at least one ground-truth column (having MLType in {GROUND_TRUTH_ML_TYPES}), " - f"but none were found in schema: {schema}" - ) - ground_truths_schema: MLTypeSchema = { - col: inferred_col_mltypes.get(col, schema[col]) for col, mltype in ground_truths_schema.items() - } - - ## Set predictions: - predictions_schema: MLTypeSchema = Schema.filter_schema( - data_schema=schema, - mltypes=PREDICTED_ML_TYPES, - ) - if len(predictions_schema) == 0: - predictions_schema: MLTypeSchema = inferred_schema_from_cols.predictions_schema - if has_predictions and len(predictions_schema) == 0: - raise ValueError( - f"Expected at least one predicted column (having MLType in {PREDICTED_ML_TYPES}), " - f"but none were found in schema: {schema}" - ) - predictions_schema: MLTypeSchema = { - col: inferred_col_mltypes.get(col, schema[col]) for col, mltype in predictions_schema.items() - } - - ## Set features: - features_schema: MLTypeSchema = inferred_schema_from_cols.features_schema - if has_features and len(features_schema) == 0: - raise ValueError(f"Expected at least one feature column, but none were found in schema: {schema}") - features_schema: MLTypeSchema = { - col: inferred_col_mltypes.get(col, schema[col]) for col, mltype in features_schema.items() - } - - ## Merge remaining columns into features schema: - cols_so_far: Set[str] = ( - {index_col}.union(set(features_schema.keys())) - .union(set(predictions_schema.keys())) - .union(set(ground_truths_schema.keys())) - ) - remaining_schema: MLTypeSchema = { - col: mltype for col, mltype in schema.items() if col not in cols_so_far - } - features_schema: MLTypeSchema = {**remaining_schema, **features_schema} - - inferred_schema: Schema = Schema( - index_col=index_col, - features_schema=features_schema, - predictions_schema=predictions_schema, - ground_truths_schema=ground_truths_schema, - ) - assert inferred_schema.columns_set == set(schema.keys()) - return inferred_schema - - @safe_validate_arguments - def infer_from_columns( - self, - columns: Union[List, Tuple, Set], - *, - 
index_col: Optional[constr(min_length=1)] = None, - infer_features: bool = True, - infer_ground_truths: bool = True, - infer_predictions: bool = True, - has_features: bool = False, - has_ground_truths: bool = False, - has_predictions: bool = False, - ) -> Schema: - ## Note: it might not be possible to infer schema for all columns based on their name alone. - columns_set: Set = as_set(columns) - schema: Dict[str, Union[MLTypeSchema, str]] = {} - flat_schema: MLTypeSchema = {} - ## If infer_* is False, has_* should also become False: - has_features: bool = has_features and infer_features - has_ground_truths: bool = has_ground_truths and infer_ground_truths - has_predictions: bool = has_predictions and infer_predictions - - schema_template_parts = [] - ## This ordering is important: - if infer_predictions: - schema_template_parts.append(("predictions_schema", self.predictions_schema_template)) - if infer_ground_truths: - schema_template_parts.append(("ground_truths_schema", self.ground_truths_schema_template)) - if infer_features: - schema_template_parts.append(("features_schema", self.features_schema_template)) - - for schema_key, schema_template_part in schema_template_parts: - schema_key_schema: MLTypeSchema = {} - for col_template, mltype in schema_template_part.items(): - for col in col_template.matches(columns_set): - if col == index_col: - continue - if flat_schema.get(col, mltype) != mltype: - raise ValueError( - f'Conflict during schema inference; column "{col}" is assigned to MLType {flat_schema[col]}, ' - f"but it also matches pattern {col_template.regex}, which is assigned to MLType {mltype} " - f"as per the following schema template:\n{schema_template_part}" - ) - flat_schema[col] = mltype - schema_key_schema[col] = mltype - if ( - schema_key == "features_schema" - and has_features - and len(schema_key_schema) == 0 - and len(schema_template_part) > 0 - ): - raise ValueError( - f"Input columns {columns_set} did not match any feature column templates: " - f"{self.features_schema_template}" - ) - if ( - schema_key == "ground_truths_schema" - and has_ground_truths - and len(schema_key_schema) == 0 - and len(schema_template_part) > 0 - ): - raise ValueError( - f"Input columns {columns_set} did not match any ground-truth column templates: " - f"{self.ground_truths_schema_template}" - ) - if ( - schema_key == "predictions_schema" - and has_predictions - and len(schema_key_schema) == 0 - and len(schema_template_part) > 0 - ): - raise ValueError( - f"Input columns {columns_set} did not match any predicted column templates: " - f"{self.predictions_schema_template}" - ) - schema[schema_key] = schema_key_schema - - if index_col is None: - ## index_col must either be passed explicitly, or matchable. - index_col: Set[str] = set( - col - for col in self.index_col_template.matches(columns_set) ## Select matching columns... - if col not in flat_schema ## ...except those in "flat_schema". 
- ) - if len(index_col) == 0: - raise ValueError( - f"Did not match any index columns from {columns_set}; please explicitly pass `index_col`" - ) - if len(index_col) > 1: - raise ValueError( - f"Expected only one in {columns_set} to match index pattern {self.index_col_template.regex}; " - f"found {len(index_col)} matching index columns: {index_col}" - ) - index_col: str = next(iter(index_col)) - flat_schema[index_col] = MLType.INDEX - schema["index_col"] = index_col - inferred_schema: Schema = Schema(**schema) - assert inferred_schema.columns_set <= columns_set - return inferred_schema - - def populate( - self, - allow_unfilled: bool = False, - features: bool = True, - ground_truths: bool = True, - predictions: bool = True, - **kwargs, - ) -> Schema: - ## Populate index col: - if ( - self.index_col_template.template == INDEX_COL_NAME_TEMPLATE - and INDEX_COL_TEMPLATE_KEY not in kwargs - ): - kwargs[INDEX_COL_TEMPLATE_KEY] = INDEX_COL_DEFAULT_NAME - index_col: str = self.index_col_template.populate(allow_unfilled=False, **kwargs) - - features_schema: MLTypeSchema = {} - if features: - features_schema: MLTypeSchema = self._populate_templates_dict( - self.features_schema_template, allow_unfilled=allow_unfilled, **kwargs - ) - ground_truths_schema: MLTypeSchema = {} - if ground_truths: - ground_truths_schema: MLTypeSchema = self._populate_templates_dict( - self.ground_truths_schema_template, allow_unfilled=allow_unfilled, **kwargs - ) - - predictions_schema: MLTypeSchema = {} - if predictions: - predictions_schema: MLTypeSchema = self._populate_templates_dict( - self.predictions_schema_template, allow_unfilled=allow_unfilled, **kwargs - ) - return Schema( - index_col=index_col, - predictions_schema=predictions_schema, - ground_truths_schema=ground_truths_schema, - features_schema=features_schema, - ) - - @classmethod - def _populate_templates_dict( - cls, templates_dict: Dict[ColTemplate, MLType], allow_unfilled: bool, **kwargs - ) -> MLTypeSchema: - schema: MLTypeSchema = {} - for col_template, mltype in templates_dict.items(): - col: Optional[Union[List[str], str]] = col_template.populate( - allow_unfilled=allow_unfilled, **kwargs - ) - if col is None or is_empty_list_like(col): - continue - if isinstance(col, str): - schema[col] = mltype - elif is_list_like(col): - for c in col: - schema[c] = mltype - return schema - - -class Schema(Parameters): - index_col: str - features_schema: MLTypeSchema = {} - predictions_schema: MLTypeSchema = {} - ground_truths_schema: MLTypeSchema = {} - - @root_validator(pre=True) - def _set_schema_params(cls, params: Dict) -> Dict: - try: - ground_truths_schema: MLTypeSchema = MLType.convert_values(params.get("ground_truths_schema", {})) - if len(set(ground_truths_schema.values()).intersection(GROUND_TRUTH_ML_TYPES)) > 0: - raise ValueError( - f"Cannot have any of the following MLTypes in `ground_truths_schema`: {GROUND_TRUTH_ML_TYPES}; " - f"found following: {Schema.filter_schema(data_schema=ground_truths_schema, mltypes=GROUND_TRUTH_ML_TYPES)}. " - f'Please instead use the "data" MLTypes: {DATA_ML_TYPES}' - ) - params["ground_truths_schema"] = ground_truths_schema - - predictions_schema: MLTypeSchema = MLType.convert_values(params.get("predictions_schema", {})) - if len(set(predictions_schema.values()).intersection(PREDICTED_ML_TYPES)) > 0: - raise ValueError( - f"Cannot have any of the following MLTypes in `predictions_schema`: {PREDICTED_ML_TYPES}; " - f"found following: {Schema.filter_schema(data_schema=predictions_schema, mltypes=PREDICTED_ML_TYPES)}. 
" - f'Please instead use the "data" MLTypes: {DATA_ML_TYPES}' - ) - params["predictions_schema"] = predictions_schema - - params["features_schema"] = MLType.convert_values(params.get("features_schema", {})) - return params - except Exception as e: - raise ValueError(String.format_exception_msg(e)) - - def index(self) -> str: - return self.index_col - - def features(self) -> MLTypeSchema: - return self.features_schema - - def predictions(self) -> MLTypeSchema: - return self.predictions_schema - - def ground_truths(self) -> MLTypeSchema: - return self.ground_truths_schema - - def flatten(self) -> MLTypeSchema: - return { - self.index_col: MLType.INDEX, - **self.features_schema, - **self.ground_truths_schema, - **self.predictions_schema, - } - - @property - def columns_set(self) -> Set[str]: - return set(self.flatten().keys()) - - @property - def columns(self) -> List[str]: - return sorted(list(self.columns_set)) - - @property - def has_features(self) -> bool: - return self.features_schema != {} - - @property - def has_ground_truths(self) -> bool: - return self.ground_truths_schema != {} - - @property - def has_predictions(self) -> bool: - return self.predictions_schema != {} - - def rename(self, columns: Union[Dict, Callable]) -> Schema: - if isinstance(columns, dict): - col_mapper: Callable = lambda col: columns.get(col, col) - else: - col_mapper: Callable = columns - return Schema( - index_col=col_mapper(self.index_col), - features_schema={col_mapper(col): mltype for col, mltype in self.features_schema.items()}, - predictions_schema={col_mapper(col): mltype for col, mltype in self.predictions_schema.items()}, - ground_truths_schema={ - col_mapper(col): mltype for col, mltype in self.ground_truths_schema.items() - }, - ) - - @staticmethod - def of( - schema: Union[Schema, MLTypeSchema], - schema_template: SchemaTemplate, - *, - index_col: Optional[constr(min_length=1)] = None, - infer_features: bool = True, - infer_ground_truths: bool = True, - infer_predictions: bool = True, - has_features: bool = False, - has_ground_truths: bool = False, - has_predictions: bool = False, - ) -> Optional[Schema]: - if isinstance(schema, SchemaTemplate): - raise ValueError( - f"Cannot instantiate `{Schema.class_name} from an instance of `{SchemaTemplate.class_name}`." - ) - if isinstance(schema, Schema): - return schema - if isinstance(schema, dict) and set(schema.keys()) <= Schema.param_names(): - ## All keys from schema_template are match variable names in Schema class - return Schema(**schema) - if not isinstance(schema, dict): - raise ValueError(f"Unsupported creation of {Schema.class_name} from data: {schema}") - - ## We have an MLTypeSchema dict: - return schema_template.infer_from_mltype_schema( - schema, - index_col=index_col, - infer_features=infer_features, - infer_ground_truths=infer_ground_truths, - infer_predictions=infer_predictions, - has_features=has_features, - has_ground_truths=has_ground_truths, - has_predictions=has_predictions, - ) - - def set_features(self, features_schema: MLTypeSchema, override: bool = False) -> Schema: - if self.has_features and override is False: - raise ValueError( - f"`features_schema` already set and cannot be overridden on {self.class_name}. 
" - f"Current schema: \n{self}" - ) - return Schema(**{**self.dict(), "features_schema": features_schema}) - - def drop_features(self) -> Schema: - return Schema(**self.dict(exclude={"features_schema"})) - - def set_predictions(self, predictions_schema: MLTypeSchema, override: bool = False) -> Schema: - if self.has_predictions and override is False: - raise ValueError( - f"`predictions_schema` already set and cannot be overridden on {self.class_name}. " - f"Current schema: \n{self}" - ) - return Schema(**{**self.dict(), "predictions_schema": predictions_schema}) - - def drop_predictions(self) -> Schema: - return Schema(**self.dict(exclude={"predictions_schema"})) - - def predictions_to_features(self) -> Schema: - return self.drop_predictions().set_features( - {**self.features_schema, **self.predictions_schema}, - override=True, - ) - - def set_ground_truths(self, ground_truths_schema: MLTypeSchema, override: bool = False) -> Schema: - if self.has_ground_truths and override is False: - raise ValueError( - f"`ground_truths_schema` already set and cannot be overridden on {self.class_name}. " - f"Current schema: \n{self}" - ) - return Schema(**{**self.dict(), "ground_truths_schema": ground_truths_schema}) - - def drop_ground_truths(self) -> Schema: - return Schema(**self.dict(exclude={"ground_truths_schema"})) - - def ground_truths_to_features(self) -> Schema: - return self.drop_ground_truths().set_features( - {**self.features_schema, **self.ground_truths_schema}, - override=True, - ) - - def keep_columns(self, cols: Union[List, Tuple, Set]) -> Schema: - cols: Set = as_set(cols) - schema: Schema = self - ## We always keep index column, so do not check that. - schema: Schema = self.set_features(keep_keys(schema.features_schema, cols), override=True) - schema: Schema = self.set_ground_truths(keep_keys(schema.ground_truths_schema, cols), override=True) - schema: Schema = self.set_predictions(keep_keys(schema.predictions_schema, cols), override=True) - return schema - - def remove_columns(self, cols: Union[List, Tuple, Set]) -> Schema: - cols: Set = as_set(cols) - schema: Schema = self - if schema.index_col in cols: - raise ValueError(f'Cannot drop index column "{schema.index_col}".') - schema: Schema = self.set_features(remove_keys(schema.features_schema, cols), override=True) - schema: Schema = self.set_ground_truths(remove_keys(schema.ground_truths_schema, cols), override=True) - schema: Schema = self.set_predictions(remove_keys(schema.predictions_schema, cols), override=True) - return schema - - @staticmethod - @safe_validate_arguments - def filter_df( - df: Any, - data_schema: Optional[Union[MLTypeSchema, List[str]]] = None, - allow_missing: bool = False, - return_series: bool = True, - sort_columns: bool = True, - **kwargs, - ) -> Optional[Any]: - if data_schema is None: - return df[sorted(list(df.columns))] - if isinstance(data_schema, dict): - cols_set: KeysView = data_schema.keys() - else: - cols_set: Set = as_set(data_schema) - if allow_missing: - cols: List = [col for col in df.columns if col in cols_set] - else: - cols: List = [col for col in data_schema] - if sort_columns: - cols: List = sorted(cols) - if return_series and len(cols) == 1: - cols: str = cols[0] - else: - cols: List = as_list(cols) - return df[cols] - - @staticmethod - def remove_missing_columns(cols: List[str], data_schema: MLTypeSchema) -> MLTypeSchema: - common_cols: Set[str] = set.intersection(as_set(cols), set(data_schema.keys())) - return {col: data_schema[col] for col in common_cols} - - @classmethod - def 
filter_index(cls, data_schema: Optional[MLTypeSchema], allow_missing: bool = False) -> Optional[str]: - if data_schema is None: - return None - return cls.filter_single_column( - data_schema, - mltype=MLType.INDEX, - allow_missing=allow_missing, - ) - - @classmethod - @safe_validate_arguments - def filter_single_column( - cls, - data_schema: MLTypeSchema, - mltype: Union[Set[MLType], MLType], - allow_missing: bool = False, - ) -> Optional[str]: - cols: MLTypeSchema = cls.filter_schema( - data_schema=data_schema, - mltypes={mltype}, - expected_num_cols=None, - ) - if len(cols) == 0 and allow_missing: - return None - if len(cols) != 1: - raise ValueError( - f"Only expected one column with the following MLType(s): {mltype}; " - f"found {len(cols)} columns: {cols}" - ) - return next(iter(cols)) - - @classmethod - @safe_validate_arguments - def filter_schema_columns( - cls, - data_schema: MLTypeSchema, - mltypes: Union[Set[MLType], MLType], - expected_num_cols: Optional[conint(ge=1)] = None, - ) -> List[str]: - cols: List[str] = list( - cls.filter_schema( - data_schema=data_schema, mltypes=mltypes, expected_num_cols=expected_num_cols - ).keys() - ) - cols: List[str] = sorted(cols) - return cols - - @classmethod - @safe_validate_arguments - def filter_schema( - cls, - data_schema: MLTypeSchema, - mltypes: Union[Set[MLType], Tuple[MLType], List[MLType], MLType], - expected_num_cols: Optional[conint(ge=1)] = None, - **kwargs, - ) -> MLTypeSchema: - assert_not_empty_dict(data_schema) - mltypes: List[MLType] = as_list(mltypes) - filtered_schema: MLTypeSchema = { - col: mltype for col, mltype in data_schema.items() if mltype in mltypes - } - if expected_num_cols is not None: - if len(filtered_schema) != expected_num_cols: - raise ValueError( - f"Only expected {expected_num_cols} column(s) with the following MLType(s): {mltypes}; " - f"found {len(filtered_schema)} columns: {sorted(list(filtered_schema.keys()))}" - ) - return filtered_schema diff --git a/src/fmcore/util/struct.py b/src/fmcore/util/struct.py deleted file mode 100644 index 39dfeed..0000000 --- a/src/fmcore/util/struct.py +++ /dev/null @@ -1,138 +0,0 @@ -import gc -from typing import * - -from pydantic import conint - -from fmcore.util.language import MutableParameters, ProgressBar, as_list, is_list_like, set_param_from_alias - - -class Trie(MutableParameters): - parent: Optional["Trie"] = None - value: Optional[Any] = None - children: Dict[str, "Trie"] = dict() - _depth: Optional[conint(ge=0)] = None - _max_child_depth: Optional[conint(ge=0)] = None - _num_children_in_subtree: Optional[conint(ge=0)] = None - - def __repr__(self): - return str(self) - - def __str__(self): - out: str = f"{self.class_name}(" - if self.value is not None: - out += f"value={self.value}, " - out += f"depth={self.depth}, num_children={self.num_children}" - if self.has_children: - out += f", children={set(self.children.keys())}" - out += ")" - return out - - @property - def depth(self) -> int: - """Calculates and returns depth of the current node. 
Root has depth of 0.""" - if self._depth is None: - if self.parent is None: - self._depth = 0 - else: - self._depth = self.parent.depth + 1 - return self._depth - - @property - def root(self) -> "Trie": - cur_node: Trie = self - while self.parent is not None: - cur_node: Trie = self.parent - return cur_node - - @property - def has_children(self) -> bool: - return self.num_children > 0 - - @property - def num_children(self) -> int: - return len(self.children) - - @property - def num_nodes(self) -> int: - return self.root.num_children_in_subtree - - @property - def num_children_in_subtree(self) -> int: - if self._num_children_in_subtree is None: - if self.has_children is False: - self._num_children_in_subtree = 0 - else: - self._num_children_in_subtree: int = ( - sum([child.num_children_in_subtree for child in self.children.values()]) - + self.num_children - ) - return self._num_children_in_subtree - - @property - def max_child_depth(self) -> int: - if self._max_child_depth is None: - if not self.has_children: - self._max_child_depth: int = self.depth - else: - self._max_child_depth: int = max([child.max_child_depth for child in self.children.values()]) - return self._max_child_depth - - @property - def max_depth(self) -> int: - return self.root.max_child_depth - - def __getitem__(self, key): - return self.children[key] - - @classmethod - def of( - cls, - nodes: Union[List[List[str]], List[str], str], - splitter: Optional[str] = None, - allow_end_at_branch: bool = True, - **kwargs, - ) -> Any: - """ - Creates a trie from a list of strings. - Each node in the trie is a dict with further subdicts. Leafs are identified as dicts with '__end__' in them. - Ref: https://stackoverflow.com/a/11016430 - """ - if isinstance(nodes, (str, set)): - nodes: List[str] = as_list(nodes) - - assert is_list_like(nodes) - set_param_from_alias(params=kwargs, param="progress_bar", alias=["progress", "pbar"], default=True) - pbar: ProgressBar = ProgressBar.of( - kwargs.get("progress_bar"), - miniters=1000, - total=len(nodes), - prefer_kwargs=True, - ) - - trie_root: Trie = Trie() - try: - for node_i, node in enumerate(nodes): - if isinstance(node, str): - if splitter is None: - raise ValueError("When passing nodes as a list of strings, please pass `splitter`.") - node: List[str] = node.split(splitter) - current_node: Trie = trie_root - # print(f'\ncreating: {node}') - for node_part_i, node_part in enumerate(node): - # print(f'\t{node_part}') - if node_part not in current_node.children: - ## For a path like 'A/B/C/D', create intermediate: - current_node.children[node_part] = Trie(parent=current_node) - current_node_child: Trie = current_node.children[node_part] - if allow_end_at_branch is False and node_part_i != len(node) - 1: - if len(current_node_child.children) == 0: - raise ValueError( - f"Branch nodes cannot be values for this Trie; thus cannot create trie from {node}" - ) - current_node: Trie = current_node_child - pbar.update(1) - if node_i % 10_000 == 0: - gc.collect() - finally: - gc.collect() - return trie_root diff --git a/tests/test_imports.py b/tests/test_imports.py index ba8bcfd..7adfdf6 100644 --- a/tests/test_imports.py +++ b/tests/test_imports.py @@ -10,7 +10,7 @@ def session_fixture(): yield # Teardown code: Clean up resources after all tests have run - from fmcore.util.concurrency._asyncio import _cleanup_event_loop + from bears.util.concurrency._asyncio import _cleanup_event_loop print("Tearing down resources after the test session.", end="") _cleanup_event_loop() @@ -18,4 +18,6 @@ def 
session_fixture(): def test_import_main_module(): - pass + import fmcore + + assert fmcore._LIBRARY_NAME == "fmcore"
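
## ----------------------------------------------------------------------------------------
## Usage sketch (illustrative only, not part of the patch): the MappedParameters class whose
## removal is shown above maps a normalized `name` string to a callable via `_mapping`, then
## instantiates it with the remaining fields. The import path and the SGDClassifier-based
## subclass below are assumptions for illustration, not code from this repository.
from sklearn.linear_model import SGDClassifier

from fmcore.util.language import MappedParameters  ## pre-patch path; assumed export


class SGDParams(MappedParameters):
    ## Keys are normalized in __init_subclass__, so "sgd" and "sgd_classifier" both resolve.
    _mapping = {("sgd", "sgd_classifier"): SGDClassifier}


params = SGDParams.of("sgd", alpha=1e-4)  ## plain string -> cls(**{"name": "sgd", "alpha": 1e-4})
params.mapped_callable()                  ## -> SGDClassifier, looked up via the normalized name
params.kwargs                             ## -> {"alpha": 0.0001}; excludes `name` and `args`
clf = params.initialize()                 ## -> SGDClassifier(alpha=0.0001)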
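
## ----------------------------------------------------------------------------------------
## Usage sketch (illustrative only, not part of the patch): the Notifier hierarchy removed
## from fmcore/util/notify.py resolved subclasses by alias via the Registry mixin. The alias
## lookup behaviour and the placeholder webhook below are assumptions for illustration.
from fmcore.util.notify import Notifier  ## pre-patch import path

notifier = Notifier.of("noop")           ## NoopNotifier: swallows messages, handy in tests
notifier.running("epoch 1/3")            ## sends e.g. "[Running at <timestamp>] epoch 1/3."
notifier.success("training done")

## A webhook-backed notifier would be created the same way, e.g.:
## Notifier.of("chime", webhook="https://hooks.chime.aws/...")
## Notifier.of("discord", webhook="https://discord.com/api/webhooks/<id>/<token>")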
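
## ----------------------------------------------------------------------------------------
## Usage sketch (illustrative only, not part of the patch): the Timer removed from
## fmcore/util/profiling.py doubles as a context manager and logs via Log.info by default.
from fmcore.util.profiling import Timer  ## pre-patch import path

with Timer(task="square numbers"):             ## logs "(square numbers) Started at ..." on entry
    squares = [x * x for x in range(100_000)]  ## work being timed
                                               ## logs "...completed in ..." on exit

timer = Timer(task="train", single_line=True, i=0, max_i=3)  ## "[1/3]" prefix in messages
timer.start()
timer.alert("halfway there")  ## "Timer has been running for ..."
timer.stop()
timer.time_taken("ms")        ## elapsed milliseconds as a float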
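
## ----------------------------------------------------------------------------------------
## Usage sketch (illustrative only, not part of the patch): Trie.of(...), removed from
## fmcore/util/struct.py, builds a prefix tree from path-like strings split on `splitter`.
from fmcore.util.struct import Trie  ## pre-patch import path

trie = Trie.of(["a/b/c", "a/b/d", "a/x"], splitter="/")
trie.num_nodes                   ## -> 5 (nodes: a, b, c, d, x)
trie.max_depth                   ## -> 3 (the root node has depth 0)
sorted(trie["a"]["b"].children)  ## -> ["c", "d"]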