Skip to content

Commit

Permalink
Fixed z-scoring when the std is 0 and stopped deleting numerical values written in e-notation.
Browse files Browse the repository at this point in the history
  • Loading branch information
richardwu committed Jun 22, 2019
1 parent bbe68e7 commit ba1cc4b
Showing 1 changed file with 13 additions and 13 deletions.
26 changes: 13 additions & 13 deletions domain/estimators/tuple_embedding.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@
from ..estimator import Estimator
from utils import NULL_REPR

NONNUMERICS = "[^0-9+-.]"
NONNUMERICS = "[^0-9+-.e]"

def verify_numerical_attr_groups(dataset, numerical_attr_groups):
"""
Expand Down Expand Up @@ -141,7 +141,7 @@ def __init__(self, env, dataset, domain_df,
self._num_attrs_std[num_attr] = temp[fil_notnull].astype(np.float).std(axis=0)
temp[fil_notnull] = ((temp[fil_notnull].astype(np.float) \
- self._num_attrs_mean[num_attr]) \
/ self._num_attrs_std[num_attr]).astype(str)
/ (self._num_attrs_std[num_attr] or 1.)).astype(str)
self._raw_data[num_attr] = temp

# Indexes assigned to attributes: first categorical then numerical.
Expand Down Expand Up @@ -643,17 +643,17 @@ def __init__(self, env, dataset, domain_df,
self._embed_size)
raise Exception()
# Replace non-numerical init values in numerical attributes with _nan_.
if self._numerical_attrs is not None:
fil_attr = self.domain_df['attribute'].isin(self._numerical_attrs)
fil_notnull = self.domain_df['weak_label'] != NULL_REPR
fil_notnumeric = self.domain_df['weak_label'].str.contains(NONNUMERICS)
bad_numerics = fil_attr & fil_notnull & fil_notnumeric
if bad_numerics.sum():
self.domain_df.loc[bad_numerics, 'weak_label'] = NULL_REPR
logging.warning('%s: replaced %d non-numerical values in DOMAIN as "%s" (NULL)',
type(self).__name__,
bad_numerics.sum(),
NULL_REPR)
# if self._numerical_attrs is not None:
# fil_attr = self.domain_df['attribute'].isin(self._numerical_attrs)
# fil_notnull = self.domain_df['weak_label'] != NULL_REPR
# fil_notnumeric = self.domain_df['weak_label'].str.contains(NONNUMERICS)
# bad_numerics = fil_attr & fil_notnull & fil_notnumeric
# if bad_numerics.sum():
# self.domain_df.loc[bad_numerics, 'weak_label'] = NULL_REPR
# logging.warning('%s: replaced %d non-numerical values in DOMAIN as "%s" (NULL)',
# type(self).__name__,
# bad_numerics.sum(),
# NULL_REPR)
# Remove domain for numerical attributes.
fil_numattr = self.domain_df['attribute'].isin(self._numerical_attrs)

Expand Down

0 comments on commit ba1cc4b

Please sign in to comment.