From 48cc920ae7cb433d26710e8e0d10d66e657debb9 Mon Sep 17 00:00:00 2001 From: Kamil Hryniewicz Date: Thu, 11 Jun 2020 08:50:20 +0200 Subject: [PATCH] Run notebook on subset --- liberia.ipynb | 551 ++++++++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 536 insertions(+), 15 deletions(-) diff --git a/liberia.ipynb b/liberia.ipynb index 45c1a93de..5d072805c 100644 --- a/liberia.ipynb +++ b/liberia.ipynb @@ -2,7 +2,39 @@ "cells": [ { "cell_type": "code", - "execution_count": null, + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "import pandas as pd\n", + "df = pd.read_csv(\"testdata/liberia.csv\")\n", + "df.drop([\"quality\", \"problems\"], axis=1, inplace=True)\n", + "df[:1000].to_csv(\"testdata/liberia_noq.csv\")" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "11724" + ] + }, + "execution_count": 2, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "len(df)" + ] + }, + { + "cell_type": "code", + "execution_count": 3, "metadata": {}, "outputs": [], "source": [ @@ -21,9 +53,17 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 4, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "06:44:25 - [DEBUG] - initiating session with parameters: {'db_user': 'holocleanuser', 'db_pwd': 'abcd1234', 'db_host': 'localhost', 'db_name': 'holo', 'threads': 1, 'timeout': 180000, 'seed': 45, 'layer_sizes': [1], 'learning_rate': 0.001, 'optimizer': 'adam', 'epochs': 20, 'weight_decay': 0.01, 'momentum': 0.0, 'batch_size': 1, 'weak_label_thresh': 0.99, 'domain_thresh_1': 0.0, 'domain_thresh_2': 0.0, 'max_domain': 10000, 'cor_strength': 0.6, 'nb_cor_strength': 0.8, 'feature_norm': False, 'weight_norm': False, 'estimator_type': 'NaiveBayes', 'estimator_epochs': 10, 'estimator_batch_size': 32, 'estimator_embedding_size': 10, 'train_attrs': None, 'infer_mode': 'dk', 'verbose': True, 'bias': False, 'print_fw': True, 'debug_mode': False}\n" + ] + } + ], "source": [ "hc = holoclean.HoloClean(\n", " db_name='holo',\n", @@ -54,11 +94,41 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 5, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "06:44:25 - [WARNING] - Dropping the following null column from the dataset: 'Region'\n", + "06:44:25 - [WARNING] - Dropping the following null column from the dataset: 'Stroke test'\n", + "06:44:25 - [ INFO] - Loaded 1000 rows with 19000 cells\n", + "06:44:25 - [DEBUG] - Time to create index: 0.00 secs\n", + "06:44:25 - [DEBUG] - Time to create index: 0.00 secs\n", + "06:44:25 - [DEBUG] - Time to create index: 0.00 secs\n", + "06:44:25 - [DEBUG] - Time to create index: 0.00 secs\n", + "06:44:25 - [DEBUG] - Time to create index: 0.00 secs\n", + "06:44:25 - [DEBUG] - Time to create index: 0.00 secs\n", + "06:44:25 - [DEBUG] - Time to create index: 0.00 secs\n", + "06:44:25 - [DEBUG] - Time to create index: 0.00 secs\n", + "06:44:25 - [DEBUG] - Time to create index: 0.00 secs\n", + "06:44:25 - [DEBUG] - Time to create index: 0.00 secs\n", + "06:44:25 - [DEBUG] - Time to create index: 0.00 secs\n", + "06:44:25 - [DEBUG] - Time to create index: 0.00 secs\n", + "06:44:25 - [DEBUG] - Time to create index: 0.00 secs\n", + "06:44:25 - [DEBUG] - Time to create index: 0.00 secs\n", + "06:44:25 - [DEBUG] - Time to create index: 0.00 secs\n", + "06:44:25 - [DEBUG] - Time to create index: 0.00 secs\n", + "06:44:25 - [DEBUG] - Time to create index: 0.00 secs\n", + "06:44:25 - [DEBUG] - Time to create index: 0.00 secs\n", + "06:44:25 - [ INFO] - DONE Loading liberia_noq.csv\n", + "06:44:25 - [DEBUG] - Time to load dataset: 0.22 secs\n" + ] + } + ], "source": [ - "hc.load_data(\"liberia\", \"testdata/liberia.csv\")" + "hc.load_data(\"liberia\", \"testdata/liberia_noq.csv\")" ] }, { @@ -99,9 +169,21 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 6, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "06:44:25 - [DEBUG] - DONE with Error Detector: NullDetector in 0.04 secs\n", + "06:44:25 - [ INFO] - detected 483 potentially erroneous cells\n", + "06:44:25 - [DEBUG] - Time to create index: 0.00 secs\n", + "06:44:25 - [ INFO] - DONE with error detection.\n", + "06:44:25 - [DEBUG] - Time to detect errors: 0.08 secs\n" + ] + } + ], "source": [ "detectors = [NullDetector()]\n", "hc.detect_errors(detectors)" @@ -116,31 +198,470 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 7, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "06:44:25 - [DEBUG] - Computing correlations...\n", + "06:44:32 - [DEBUG] - correlations:\n", + "attr Unnamed: 0 Instance Submission Date Submitter settlement \\\n", + "cond_attr \n", + "CMMoney 0.178756 0.178756 0.182726 0.168557 0.192597 \n", + "Functionality 0.109320 0.109320 0.111965 0.087271 0.113627 \n", + "Geo Code 0.988325 0.988325 0.995203 0.998560 0.997882 \n", + "Instance 1.000000 1.000000 1.000000 1.000000 1.000000 \n", + "Latitude 0.993098 0.993098 1.000000 1.000000 1.000000 \n", + "Longitude 0.993098 0.993098 1.000000 1.000000 1.000000 \n", + "Submission Date 0.992775 0.992775 1.000000 0.999674 0.999793 \n", + "Submitter 0.585167 0.585167 0.590139 1.000000 0.613940 \n", + "Unnamed: 0 1.000000 1.000000 1.000000 1.000000 1.000000 \n", + "Wptype 0.026905 0.026905 0.029122 0.088595 0.040718 \n", + "YearCons 0.354940 0.354940 0.358721 0.223316 0.356973 \n", + "committee 0.109022 0.109022 0.111665 0.101341 0.120695 \n", + "drinking? 0.083260 0.083260 0.085769 0.076938 0.089179 \n", + "handpump 0.099625 0.099625 0.102220 0.126282 0.109051 \n", + "partner 0.729834 0.729834 0.735533 0.723274 0.742956 \n", + "photo 0.992976 0.992976 1.000000 1.000000 1.000000 \n", + "quantity 0.173257 0.173257 0.176234 0.122849 0.177834 \n", + "settlement 0.957531 0.957531 0.964371 0.995318 1.000000 \n", + "\n", + "attr Wptype handpump Functionality YearCons partner \\\n", + "cond_attr \n", + "CMMoney 0.951049 0.744687 0.521480 0.314553 0.246513 \n", + "Functionality 0.931390 0.733954 1.000000 0.298901 0.198882 \n", + "Geo Code 0.999436 0.996619 0.994949 0.996656 0.996934 \n", + "Instance 1.000000 1.000000 1.000000 1.000000 1.000000 \n", + "Latitude 1.000000 1.000000 1.000000 1.000000 1.000000 \n", + "Longitude 1.000000 1.000000 1.000000 1.000000 1.000000 \n", + "Submission Date 1.000000 1.000000 1.000000 0.999586 0.999762 \n", + "Submitter 0.969971 0.831470 0.535768 0.482146 0.627726 \n", + "Unnamed: 0 1.000000 1.000000 1.000000 1.000000 1.000000 \n", + "Wptype 1.000000 0.761506 0.466353 0.302570 0.166037 \n", + "YearCons 0.959124 0.784619 0.498981 1.000000 0.380487 \n", + "committee 0.934076 0.732856 0.466171 0.291265 0.199898 \n", + "drinking? 0.930546 0.729746 0.651284 0.290177 0.180482 \n", + "handpump 0.963661 1.000000 0.474810 0.322976 0.221755 \n", + "partner 0.983062 0.916437 0.698272 0.691148 1.000000 \n", + "photo 1.000000 1.000000 1.000000 1.000000 1.000000 \n", + "quantity 0.933570 0.732716 0.535806 0.297367 0.230158 \n", + "settlement 0.997421 0.979733 0.937511 0.955632 0.974529 \n", + "\n", + "attr drinking? quantity committee CMMoney photo Latitude \\\n", + "cond_attr \n", + "CMMoney 0.328969 0.219604 0.000000 1.000000 0.182726 0.182596 \n", + "Functionality 0.562279 0.217212 0.328256 0.430005 0.111897 0.112065 \n", + "Geo Code 0.985859 0.992548 0.996176 0.998757 0.995204 0.995204 \n", + "Instance 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 \n", + "Latitude 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 \n", + "Longitude 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 \n", + "Submission Date 1.000000 1.000000 1.000000 1.000000 0.999798 0.999796 \n", + "Submitter 0.267137 0.324643 0.466151 0.674407 0.590202 0.591021 \n", + "Unnamed: 0 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 \n", + "Wptype 0.189388 0.151665 0.334559 0.423152 0.029067 0.029304 \n", + "YearCons 0.218376 0.176586 0.345997 0.475619 0.358819 0.359399 \n", + "committee 0.202467 0.149695 1.000000 0.415637 0.111597 0.111808 \n", + "drinking? 1.000000 0.214351 0.334826 0.422561 0.085705 0.085732 \n", + "handpump 0.194244 0.153998 0.336673 0.429497 0.102153 0.102168 \n", + "partner 0.501919 0.513555 0.626205 0.738250 0.735574 0.735927 \n", + "photo 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 \n", + "quantity 0.325599 1.000000 0.330927 0.425682 0.176156 0.176641 \n", + "settlement 0.898126 0.929603 0.966176 0.987533 0.964377 0.964249 \n", + "\n", + "attr Longitude Geo Code \n", + "cond_attr \n", + "CMMoney 0.182596 0.183674 \n", + "Functionality 0.112065 0.112835 \n", + "Geo Code 0.995204 1.000000 \n", + "Instance 1.000000 1.000000 \n", + "Latitude 1.000000 1.000000 \n", + "Longitude 1.000000 1.000000 \n", + "Submission Date 0.999796 0.999795 \n", + "Submitter 0.591021 0.593579 \n", + "Unnamed: 0 1.000000 1.000000 \n", + "Wptype 0.029304 0.030612 \n", + "YearCons 0.359399 0.360306 \n", + "committee 0.111808 0.112987 \n", + "drinking? 0.085732 0.086003 \n", + "handpump 0.102168 0.102700 \n", + "partner 0.735927 0.737075 \n", + "photo 1.000000 1.000000 \n", + "quantity 0.176641 0.177145 \n", + "settlement 0.964249 0.966866 \n", + "06:44:32 - [DEBUG] - summary of correlations:\n", + "attr Unnamed: 0 Instance Submission Date Submitter settlement \\\n", + "count 18.000000 18.000000 18.000000 18.000000 18.000000 \n", + "mean 0.575994 0.575994 0.580204 0.595110 0.586402 \n", + "std 0.418003 0.418003 0.419176 0.439011 0.418665 \n", + "min 0.026905 0.026905 0.029122 0.076938 0.040718 \n", + "25% 0.125304 0.125304 0.128032 0.123707 0.134980 \n", + "50% 0.657501 0.657501 0.662836 0.859296 0.678448 \n", + "75% 0.992926 0.992926 1.000000 1.000000 1.000000 \n", + "max 1.000000 1.000000 1.000000 1.000000 1.000000 \n", + "\n", + "attr Wptype handpump Functionality YearCons partner drinking? \\\n", + "count 18.000000 18.000000 18.000000 18.000000 18.000000 18.000000 \n", + "mean 0.975184 0.885797 0.765632 0.680165 0.634620 0.648576 \n", + "std 0.028479 0.122744 0.240640 0.336962 0.386167 0.361597 \n", + "min 0.930546 0.729746 0.466171 0.290177 0.166037 0.189388 \n", + "25% 0.953068 0.748892 0.525052 0.305566 0.223856 0.281752 \n", + "50% 0.990242 0.948085 0.817891 0.823390 0.801128 0.730202 \n", + "75% 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 \n", + "max 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 \n", + "\n", + "attr quantity committee CMMoney photo Latitude Longitude \\\n", + "count 18.000000 18.000000 18.000000 18.000000 18.000000 18.000000 \n", + "mean 0.613526 0.670330 0.745617 0.580182 0.580328 0.580328 \n", + "std 0.397393 0.353261 0.273618 0.419189 0.419106 0.419106 \n", + "min 0.149695 0.000000 0.415637 0.029067 0.029304 0.029304 \n", + "25% 0.215066 0.335288 0.429624 0.127962 0.128209 0.128209 \n", + "50% 0.721579 0.796191 0.862891 0.662888 0.663474 0.663474 \n", + "75% 1.000000 1.000000 1.000000 0.999950 0.999949 0.999949 \n", + "max 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 \n", + "\n", + "attr Geo Code \n", + "count 18.000000 \n", + "mean 0.581310 \n", + "std 0.419157 \n", + "min 0.030612 \n", + "25% 0.129027 \n", + "50% 0.665327 \n", + "75% 1.000000 \n", + "max 1.000000 \n", + "06:44:32 - [DEBUG] - computing frequency and co-occurrence statistics from raw data...\n", + "06:44:32 - [DEBUG] - Collecting single/pair-wise statistics...\n", + "06:44:34 - [DEBUG] - DONE computing statistics in 2.41s\n", + "06:44:34 - [DEBUG] - preparing pruned co-occurring statistics...\n", + "100%|██████████| 18/18 [00:00<00:00, 49.43it/s]\n", + "06:44:34 - [DEBUG] - DONE with pruned co-occurring statistics in 0.37 secs\n", + "06:44:34 - [DEBUG] - generating initial set of un-pruned domain values...\n", + "100%|██████████| 1000/1000 [00:00<00:00, 1472.86it/s]\n", + "06:44:35 - [DEBUG] - domain size stats: count 6970.000000\n", + "mean 11.580631\n", + "std 15.005491\n", + "min 2.000000\n", + "25% 4.000000\n", + "50% 6.000000\n", + "75% 12.000000\n", + "max 80.000000\n", + "Name: domain_size, dtype: float64\n", + "06:44:35 - [DEBUG] - domain count by attr: partner 999\n", + "Latitude 998\n", + "Geo Code 998\n", + "Longitude 998\n", + "YearCons 993\n", + "CMMoney 992\n", + "Submitter 992\n", + "Name: attribute, dtype: int64\n", + "06:44:35 - [DEBUG] - DONE generating initial set of domain values in 0.72s\n", + "06:44:36 - [DEBUG] - Time to create index: 0.00 secs\n", + "06:44:36 - [DEBUG] - Time to create index: 0.00 secs\n", + "06:44:36 - [DEBUG] - Time to create index: 0.00 secs\n", + "06:44:36 - [DEBUG] - Time to create table: 0.00 secs\n", + "06:44:37 - [DEBUG] - Time to create index: 0.00 secs\n", + "06:44:37 - [ INFO] - DONE with domain preparation.\n", + "06:44:37 - [DEBUG] - Time to generate the domain: 11.21 secs\n" + ] + } + ], "source": [ "hc.generate_domain()" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 8, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "06:44:37 - [DEBUG] - training estimator for estimating domain value probabilities...\n", + "06:44:37 - [DEBUG] - using estimator: NaiveBayes\n", + "06:44:37 - [DEBUG] - DONE training estimator in 0.01s\n", + "06:44:37 - [DEBUG] - predicting domain value probabilities from estimator...\n", + "06:44:37 - [DEBUG] - DONE predictions in 0.00 secs, re-constructing cell domain...\n", + "06:44:37 - [DEBUG] - re-assembling final cell domain table...\n", + "100%|██████████| 6970/6970 [00:02<00:00, 3115.74it/s]\n", + "100%|██████████| 6970/6970 [00:00<00:00, 142325.85it/s]\n", + "06:44:39 - [DEBUG] - DONE assembling cell domain table in 2.52s\n", + "06:44:39 - [ INFO] - number of (additional) weak labels assigned from estimator: 144\n", + "06:44:39 - [DEBUG] - DONE generating domain and weak labels\n", + "06:44:40 - [DEBUG] - Time to create index: 0.00 secs\n", + "06:44:40 - [DEBUG] - Time to create index: 0.00 secs\n", + "06:44:40 - [DEBUG] - Time to create index: 0.00 secs\n", + "06:44:40 - [DEBUG] - Time to create table: 0.00 secs\n", + "06:44:40 - [DEBUG] - Time to create index: 0.00 secs\n" + ] + } + ], "source": [ "hc.run_estimator()" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 9, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "06:44:40 - [DEBUG] - Time to execute query: 0.00 secs\n", + "06:44:40 - [DEBUG] - Time to execute query: 0.00 secs\n", + "06:44:40 - [DEBUG] - Time to execute query: 0.00 secs\n", + "06:44:40 - [DEBUG] - featurizing training data...\n", + "06:44:40 - [DEBUG] - Time to execute query: 0.01 secs\n", + "100%|██████████| 6970/6970 [00:12<00:00, 559.48it/s]\n", + "06:44:54 - [DEBUG] - DONE featurization. Feature tensor size: torch.Size([6970, 80, 144])\n", + "06:44:54 - [DEBUG] - generating weak labels...\n", + "06:44:54 - [DEBUG] - Time to execute query: 0.00 secs\n", + "100%|██████████| 6661/6661 [00:00<00:00, 71341.86it/s]\n", + "06:44:54 - [DEBUG] - DONE generating weak labels.\n", + "06:44:54 - [DEBUG] - generating mask...\n", + "06:44:54 - [DEBUG] - Time to execute query: 0.00 secs\n", + "100%|██████████| 6970/6970 [00:00<00:00, 103917.20it/s]\n", + "06:44:54 - [DEBUG] - DONE generating mask.\n", + "06:44:54 - [ INFO] - DONE setting up featurized dataset.\n", + "06:44:54 - [DEBUG] - Time to featurize data: 13.99 secs\n", + "06:44:54 - [DEBUG] - training model with first layer size: ['torch.Size([18, 1])', 'torch.Size([126, 1])']\n", + "06:44:54 - [ INFO] - DONE setting up repair model.\n", + "06:44:54 - [DEBUG] - Time to setup repair model: 13.99 secs\n", + "06:44:54 - [ INFO] - training with 6661 training examples (cells)\n", + " 0%| | 0/20 [00:00