From 9561ad6ec67e7f390806a216e935c9cd9c77ff7f Mon Sep 17 00:00:00 2001 From: Mina Farid Date: Fri, 9 Nov 2018 17:26:13 -0500 Subject: [PATCH] update requirements --- README.md | 203 ++++++++-------------------------- __init__.py | 4 + repair/featurize/langmodel.py | 2 +- requirements.txt | 3 +- tests/test_holoclean.py | 16 ++- 5 files changed, 67 insertions(+), 161 deletions(-) diff --git a/README.md b/README.md index d7f4df6c8..9c75036f4 100644 --- a/README.md +++ b/README.md @@ -1,6 +1,6 @@ # HoloClean: A Machine Learning System for Data Enrichment -[HoloClean](www.holoclean.io) is build ontop of PyTorch and Postgres. +[HoloClean](www.holoclean.io) is built on top of PyTorch and Postgres. HoloClean is a statistical inference engine to impute, clean, and enrich data. As a weakly supervised machine learning system, HoloClean leverages available @@ -19,8 +19,8 @@ predictions, and insights form noisy, incomplete, and erroneous data. #### Ubuntu Install Postgres by running -```sh -apt-get install postgresql postgresql-contrib +```bash +$ apt-get install postgresql postgresql-contrib ``` #### MacOS @@ -31,48 +31,52 @@ Installation instructions can be found at ### 2. 
Setup Postgres for HoloClean To start the Postgres console from your terminal -```sh -psql --user # or you can omit --user to use current user +```bash +$ psql --user # you can omit --user to use current user ``` -We then create a database `holo` and user `holocleanuser` (default settings for HoloClean) +We then create a database `holo` and user `holo` (default settings for HoloClean) ``` CREATE DATABASE holo; -CREATE USER holocleanuser; -ALTER USER holocleanuser WITH PASSWORD 'abcd1234'; -GRANT ALL PRIVILEGES ON DATABASE holo TO holocleanuser; +CREATE USER holo; +ALTER USER holo WITH PASSWORD 'clean'; +GRANT ALL PRIVILEGES ON DATABASE holo TO holo; \c holo -ALTER SCHEMA public OWNER TO holocleanuser; +ALTER SCHEMA public OWNER TO holo; ``` -In general, to connect to the `holo` database from the Postgres console +In general, to connect to the `holo` database from the Postgres psql console ``` \c holo ``` HoloClean currently populates the database `holo` with auxiliary and meta tables. -To clear the database simply connect as a root user or as `holocleanuser` and run +To clear the database simply connect as a root user or as `holo` and run ``` DROP DATABASE holo; CREATE DATABASE holo; ``` -### 3. Install HoloClean +### 3. 
Set up HoloClean -#### Option 1: pip and conda (recommended) +#### Virtual Environment + +##### Option 1: Set up a conda Virtual Environment + +Install Conda using one of the following methods ##### Ubuntu For **32-bit machines** run -```sh -wget https://repo.continuum.io/archive/Anaconda-2.3.0-Linux-x86.sh -sh Anaconda-2.3.0-Linux-x86.sh +```bash +$ wget https://repo.continuum.io/archive/Anaconda-2.3.0-Linux-x86.sh +$ sh Anaconda-2.3.0-Linux-x86.sh ``` For **64-bit machines** run -``` -wget https://repo.continuum.io/archive/Anaconda-2.3.0-Linux-x86_64.sh -sh Anaconda-2.3.0-Linux-x86_64.sh +```bash +$ wget https://repo.continuum.io/archive/Anaconda-2.3.0-Linux-x86_64.sh +$ sh Anaconda-2.3.0-Linux-x86_64.sh ``` ##### MacOS @@ -83,31 +87,20 @@ Anaconda (NOT miniconda). ##### Create a conda environment Create a **Python 3** conda environment by running -```sh -conda create -n holo_env python=3 + +```bash +$ conda create -n holo_env python=3 ``` Upon starting/restarting your terminal session, you will need to activate your conda environment by running -```sh -source activate holo_env +```bash +$ source activate holo_env ``` **NOTE: ensure your environment is activated throughout the installation process.** -#### Install required packages -```sh -pip install -r requirements.txt -``` - -##### Install the holoclean package - -Install `holoclean` via `pip` -``` -pip install holoclean -``` - -#### Option 2: pip and Virtualenv +##### Option 2: Set up a virtual environment using pip and Virtualenv If you are familiar with Virtualenv, create a new **Python 3** environment with your preferred Virtualenv wrapper, for example: @@ -119,143 +112,41 @@ with your preferred Virtualenv wrapper, for example: Either follow instructions [here](https://virtualenv.pypa.io/en/stable/installation/) or install via `pip` -```sh -pip install virtualenv +```bash +$ pip install virtualenv ``` ##### Create a Virtualenv environment Create a new directory for a **Python 3** virtualenv environment 
-```sh -mkdir -p holo_env -virtualenv --python=python holo_env +```bash +$ mkdir -p holo_env +$ virtualenv --python=python holo_env ``` where `python` is a valid reference to a python executable. Activate the environment -``` -source bin/activate +```bash +$ source bin/activate ``` **NOTE: ensure your environment is activated throughout the installation process.** -##### Install the holoclean package +##### Install the requirements of HoloClean -Install `holoclean` via `pip` -``` -pip install holoclean -``` - -#### Option 3: Manual (from source) - -You can manually clone this repository -```sh -git clone git@github.com:HoloClean/holoclean.git -cd holoclean -``` +In the project root directory, run the following to install the required packages. +Note that this command installs the packages within the activated virtual environment. -It is recommended you still create a conda or Virtualenv environment before -installing the package below (see above for instructions for creating either -types of environment). Install the `holoclean` package using `setuptools` by running -```sh -python setup.py install +```bash +$ pip install -r requirements.txt ``` ## Usage -After installation, you can use `holoclean` as a standalone Python module -```python -import holoclean - -### 0. Setup holoclean session - -hc = holoclean.HoloClean() -session = hc.session - -### 1. Load training data and denial constraints - -# 'hospital' is the name of your dataset -# 'data' is the path to the CSV file -# 'hospital.csv' is the CSV filename -session.load_data('hospital', 'data', 'hospital.csv') -# Denial constraints in a TXT file -session.load_dcs('data','hospital_constraints_att.txt') -session.ds.set_constraints(session.get_dcs()) - -### 2. Detect error cells - -detectors = [NullDetector(),ViolationDetector()] -hc.detect_errors(detectors) - -### 3. Repair errors +See the code in `tests/test_holoclean.py` for a documented usage example of HoloClean. 
-hc.setup_domain() -featurizers = [InitFeaturizer(),OccurFeaturizer(), ConstraintFeat()] -hc.repair_errors(featurizers) - -### 4. Evaluate results - -# 'hospital_clean.csv' is the ground truth (i.e. test set labels) -hc.evaluate('data','hospital_clean.csv', get_tid, get_attr, get_value) -``` - - -## Contributing (advanced) - -### Setting up development environment - -It is recommended you create a conda environment when developing (see installation -instructions above for conda). - -1. Create a conda environment for developing holoclean - ```sh - conda create -n holo_dev python=3 - ``` - -2. Activate your environment (**must do this every time you start/restart a new terminal session**): - ```sh - source activate holo_dev - ``` - -3. Install `holoclean` as a local editable package - ```sh - python setup.py develop - ``` - -4. Verify that you've installed it - ```sh - > conda list | grep holoclean - holoclean 0.2.0 - ``` - -5. You should be able to import `holoclean` from anywhere now! - ```sh - python -c "import holoclean" - ``` - -### Testing - -After setting up your development environment and setting up `holoclean` as a -development package, you should be able to run any of the tests under -`tests/`, for example -```sh -sh tests/start_test.sh -``` - -### Building as a conda package - -To build Holoclean as a conda package, first install `conda-build` +In order to run the test script, run the following: +```bash +$ cd tests +$ python test_holoclean.py ``` -conda install conda-build -``` -add the `pytorch` and `conda-forge` channels to your conda config -(`~/.condarc`) if you haven't already done so -``` -conda config --add channels conda-forge -conda config --add channels pytorch -``` -then run the following command in the terminal in this repository: -```sh -conda-build . 
-``` - diff --git a/__init__.py b/__init__.py index 9c2a3f769..d3a02cdec 100644 --- a/__init__.py +++ b/__init__.py @@ -1,3 +1,7 @@ __author__ = "HoloClean" __status__ = "Development" __version__ = "0.2.0" + +from .holoclean import HoloClean + +__all__ = ['HoloClean'] \ No newline at end of file diff --git a/repair/featurize/langmodel.py b/repair/featurize/langmodel.py index 7ab4e7bed..3da5ceae1 100644 --- a/repair/featurize/langmodel.py +++ b/repair/featurize/langmodel.py @@ -17,7 +17,7 @@ def specific_setup(self): raw_data = self.ds.get_raw_data() for attr in self.ds.attr_to_idx: attr_corpus = list(zip(raw_data[attr].tolist())) - model = FastText(attr_corpus,min_count=1,size=self.emb_size) + model = FastText(attr_corpus, min_count=1, size=self.emb_size) self.attr_language_model[attr] = model def gen_feat_tensor(self, input, classes): diff --git a/requirements.txt b/requirements.txt index 66fe7e9e3..c820ddbd7 100644 --- a/requirements.txt +++ b/requirements.txt @@ -6,4 +6,5 @@ tqdm==4.15.0 scipy==1.1.0 numpy==1.15.3 pandas==0.23.4 -gensim==3.6.0 +gensim==3.2.0 +enum34==1.1.6 diff --git a/tests/test_holoclean.py b/tests/test_holoclean.py index a7d67ad23..c2e0b450d 100644 --- a/tests/test_holoclean.py +++ b/tests/test_holoclean.py @@ -1,7 +1,8 @@ import sys -sys.path.append("..") -from holoclean import HoloClean +sys.path.insert(0, '..') + +import holoclean from detect import NullDetector, ViolationDetector from repair.featurize import InitFeaturizer @@ -25,13 +26,22 @@ def get_value(row): return row['correct_val'].lower() -hc = HoloClean(pruning_topk=0.1, epochs=30, weight_decay=0.01, threads=20, batch_size=1, verbose=True, timeout=3*60000).session +# 1. Setup a HoloClean session. +hc = holoclean.HoloClean(pruning_topk=0.1, epochs=30, weight_decay=0.01, threads=20, batch_size=1, verbose=True, timeout=3*60000).session + +# 2. Load training data and denial constraints. 
hc.load_data('hospital', 'data', 'hospital.csv') hc.load_dcs('data', 'hospital_constraints_att.txt') hc.ds.set_constraints(hc.get_dcs()) + +# 3. Detect erroneous cells using these two detectors. detectors = [NullDetector(), ViolationDetector()] hc.detect_errors(detectors) + +# 4. Repair errors utilizing the defined features. hc.setup_domain() featurizers = [InitAttFeaturizer(), InitSimFeaturizer(), FreqFeaturizer(), OccurFeaturizer(), LangModelFeat(), ConstraintFeat()] hc.repair_errors(featurizers) + +# 5. Evaluate the correctness of the results. hc.evaluate('data', 'hospital_clean.csv', get_tid, get_attr, get_value)