Skip to content

Commit

Permalink
Fix to how multi-ground truth is handled. Use simplified numerical regression TupleEmbedding with nonlinearity.
Browse files (browse the repository at this point in the history)
  • Loading branch information
richardwu committed Jun 22, 2019
1 parent 5c940a5 commit e5e01d0
Show file tree
Hide file tree
Showing 6 changed files with 224 additions and 119 deletions.
16 changes: 5 additions & 11 deletions detect/detect.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,15 +32,13 @@ def detect_errors(self, detectors):
errors.append(error_df)

# Get unique errors only that might have been detected from multiple detectors.
errors_df = pd.concat(errors, ignore_index=True).drop_duplicates().reset_index(drop=True)
if errors_df.shape[0]:
errors_df['_cid_'] = errors_df.apply(lambda x: self.ds.get_cell_id(x['_tid_'], x['attribute']), axis=1)
logging.info("detected %d potentially erroneous cells", errors_df.shape[0])
self.errors_df = pd.concat(errors, ignore_index=True).drop_duplicates().reset_index(drop=True)
if self.errors_df.shape[0]:
self.errors_df['_cid_'] = self.errors_df.apply(lambda x: self.ds.get_cell_id(x['_tid_'], x['attribute']), axis=1)
logging.info("detected %d potentially erroneous cells", self.errors_df.shape[0])

# Store errors to db.
self.store_detected_errors(errors_df)
# Store the active attributes to Dataset.
self.store_active_attributes(errors_df)
self.store_detected_errors(self.errors_df)
status = "DONE with error detection."
toc_total = time.clock()
detect_time = toc_total - tic_total
Expand All @@ -51,8 +49,4 @@ def store_detected_errors(self, errors_df):
raise Exception("ERROR: Detected errors dataframe is empty.")
self.ds.generate_aux_table(AuxTables.dk_cells, errors_df, store=True)
self.ds.aux_table[AuxTables.dk_cells].create_db_index(self.ds.engine, ['_cid_'])

def store_active_attributes(self, errors_df):
if errors_df.empty:
raise Exception("ERROR: Detected errors dataframe is empty.")
self.ds._active_attributes = sorted(errors_df['attribute'].unique())
7 changes: 7 additions & 0 deletions domain/domain.py
Original file line number Diff line number Diff line change
Expand Up @@ -69,6 +69,13 @@ def compute_correlations(self):
self.correlations = compute_norm_cond_entropy_corr(data_df,
self.ds.get_attributes(),
self.ds.get_attributes())
df_corrs = pd.DataFrame.from_dict(self.correlations, orient='columns')
df_corrs.index.name = 'cond_attr'
df_corrs.columns.name = 'attr'
pd.set_option('display.max_columns', len(df_corrs.columns))
pd.set_option('display.max_rows', len(df_corrs.columns))
logging.debug("correlations:\n%s", df_corrs)
logging.debug("summary of correlations:\n%s", df_corrs.describe())

def store_domains(self, domain):
"""
Expand Down
Loading

0 comments on commit e5e01d0

Please sign in to comment.