# Source code for Orange.distance.base

```
# This module defines abstract base classes; derived classes are abstract, too
# pylint: disable=abstract-method
import numpy as np
import sklearn.metrics as skl_metrics
from Orange.data import Table, Domain, Instance, RowInstance
from Orange.misc import DistMatrix
from Orange.preprocess import SklImpute
from Orange.statistics import util
# TODO: When we upgrade to numpy 1.13, change use argument copy=False in
# nan_to_num instead of assignment
# TODO this *private* function is called from several widgets to prepare
# data for calling the below classes. After we (mostly) stopped relying
# on sklearn.metrics, this is (mostly) unnecessary
# Afterwards, also remove the following line:
# pylint: disable=redefined-outer-name
def _preprocess(table, impute=True):
"""Remove categorical attributes and impute missing values."""
if not len(table): # this can be an array, pylint: disable=len-as-condition
return table
new_domain = Domain(
[a for a in table.domain.attributes if a.is_continuous],
table.domain.class_vars,
table.domain.metas)
new_data = table.transform(new_domain)
if impute:
new_data = SklImpute()(new_data)
return new_data
# TODO I have put this function here as a substitute the above `_preprocess`.
# None of them really belongs here; (re?)move them, eventually.
def remove_discrete_features(data):
"""Remove discrete columns from the data."""
new_domain = Domain(
[a for a in data.domain.attributes if a.is_continuous],
data.domain.class_vars,
data.domain.metas)
return data.transform(new_domain)
def remove_nonbinary_features(data):
"""Remove non-binary columns from the data."""
new_domain = Domain(
[a for a in data.domain.attributes
if a.is_discrete and len(a.values) == 2],
data.domain.class_vars,
data.domain.metas)
return data.transform(new_domain)
def impute(data):
"""Impute missing values."""
return SklImpute()(data)
def _orange_to_numpy(x):
"""
Return :class:`numpy.ndarray` (dense or sparse) with attribute data
from the given instance of :class:`Orange.data.Table`,
:class:`Orange.data.RowInstance` or :class:`Orange.data.Instance`.
"""
if isinstance(x, Table):
return x.X
elif isinstance(x, Instance):
return np.atleast_2d(x.x)
elif isinstance(x, np.ndarray):
return np.atleast_2d(x)
else:
return x # e.g. None
[docs]class Distance:
"""
Base class for construction of distances models (:obj:`DistanceModel`).
Distances can be computed between all pairs of rows in one table, or
between pairs where one row is from one table and one from another.
If `axis` is set to `0`, the class computes distances between all pairs
of columns in a table. Distances between columns from separate tables are
probably meaningless, thus unsupported.
The class can be used as follows:
- Constructor is called only with keyword argument `axis` that
specifies the axis over which the distances are computed, and with other
subclass-specific keyword arguments.
- Next, we call the method `fit(data)` to produce an instance of
:obj:`DistanceModel`; the instance stores any parameters needed for
computation of distances, such as statistics for normalization and
handling of missing data.
- We can then call the :obj:`DistanceModel` with data to compute the
distance between its rows or columns, or with two data tables to
compute distances between all pairs of rows.
The second, shorter way to use this class is to call the constructor with
one or two data tables and any additional keyword arguments. Constructor
will execute the above steps and return :obj:`~Orange.misc.DistMatrix`.
Such usage is here for backward compatibility, practicality and efficiency.
Args:
e1 (:obj:`~Orange.data.Table` or :obj:`~Orange.data.Instance` or \
:obj:`np.ndarray` or `None`):
data on which to train the model and compute the distances
e2 (:obj:`~Orange.data.Table` or :obj:`~Orange.data.Instance` or \
:obj:`np.ndarray` or `None`):
if present, the class computes distances with pairs coming from
the two tables
axis (int):
axis over which the distances are computed, 1 (default) for
rows, 0 for columns
impute (bool):
if `True` (default is `False`), nans in the computed distances
are replaced with zeros, and infs with very large numbers.
callback (callable or None):
callback function
Attributes:
axis (int):
axis over which the distances are computed, 1 (default) for
rows, 0 for columns
impute (bool):
if `True` (default is `False`), nans in the computed distances
are replaced with zeros, and infs with very large numbers.
normalize (bool):
if `True`, columns are normalized before computation. This attribute
applies only if the distance supports normalization.
The capabilities of the metrics are described with class attributes.
If class attribute `supports_discrete` is `True`, the distance
also uses discrete attributes to compute row distances. The use of discrete
attributes depends upon the type of distance; e.g. Jaccard distance observes
whether the value is zero or non-zero, while Euclidean and Manhattan
distance observes whether a pair of values is same or different.
Class attribute `supports_missing` indicates that the distance can cope
with missing data. In such cases, letting the distance handle it should
be preferred over pre-imputation of missing values.
Class attribute `supports_normalization` indicates that the constructor
accepts an argument `normalize`. If set to `True`, the metric will attempt
to normalize the values in a sense that each attribute will have equal
influence. For instance, the Euclidean distance subtract the mean and
divides the result by the deviation, while Manhattan distance uses the
median and MAD.
If class attribute `supports_sparse` is `True`, the class will handle
sparse data. Currently, all classes that do handle it rely on fallbacks to
SKL metrics. These, however, do not support discrete data and missing
values, and will fail silently.
"""
supports_sparse = False
supports_discrete = False
supports_normalization = False
supports_missing = True
# Predefined here to silence pylint, which doesn't look into __new__
normalize = False
axis = 1
impute = False
def __new__(cls, e1=None, e2=None, axis=1, impute=False,
callback=None, **kwargs):
self = super().__new__(cls)
self.axis = axis
self.impute = impute
self.callback = callback
# Ugly, but needed to allow allow setting subclass-specific parameters
# (such as normalize) when `e1` is not `None` and the `__new__` in the
# subclass is skipped
self.__dict__.update(**kwargs)
if e1 is None:
return self
# Fallbacks for sparse data and numpy tables. Remove when subclasses
# no longer use fallbacks for sparse data, and handling numpy tables
# becomes obsolete (or handled elsewhere)
if (not hasattr(e1, "domain")
or hasattr(e1, "is_sparse") and e1.is_sparse()):
fallback = getattr(self, "fallback", None)
if fallback is not None:
# pylint: disable=not-callable
return fallback(e1, e2, axis, impute)
# Magic constructor
model = self.fit(e1)
return model(e1, e2)
def fit(self, data):
"""
Abstract method returning :obj:`DistanceModel` fit to the data
Args:
e1 (Orange.data.Table, Orange.data.Instance, np.ndarray):
data for fitting the distance model
Returns:
model (DistanceModel)
"""
raise NotImplementedError
@staticmethod
def check_no_discrete(n_vals):
"""
Raise an exception if there are any discrete attributes.
Args:
n_vals (list of int): number of attributes values, 0 for continuous
"""
if any(n_vals):
raise ValueError("columns with discrete values are incommensurable")
class DistanceModel:
"""
Base class for classes that compute distances between data rows or columns.
Instances of these classes are not constructed directly but returned by
the corresponding instances of :obj:`Distance`.
Attributes:
axis (int, readonly):
axis over which the distances are computed, 1 (default) for
rows, 0 for columns
impute (bool):
if `True` (default is `False`), nans in the computed distances
are replaced with zeros, and infs with very large numbers
callback (callable or None):
callback function
"""
def __init__(self, axis, impute=False, callback=None):
self._axis = axis
self.impute = impute
self.callback = callback
@property
def axis(self):
return self._axis
def __call__(self, e1, e2=None):
"""
If e2 is omitted, calculate distances between all rows (axis=1) or
columns (axis=2) of e1. If e2 is present, calculate distances between
all pairs if rows from e1 and e2.
This method converts the data into numpy arrays, calls the method
`compute_data` and packs the result into `DistMatrix`. Subclasses are
expected to define the `compute_data` and not the `__call__` method.
Args:
e1 (Orange.data.Table or Orange.data.Instance or numpy.ndarray):
input data
e2 (Orange.data.Table or Orange.data.Instance or numpy.ndarray):
secondary data
Returns:
A distance matrix (Orange.misc.distmatrix.DistMatrix)
"""
if self.axis == 0 and e2 is not None:
# Backward compatibility fix
if e2 is e1:
e2 = None
else:
raise ValueError("Two tables cannot be compared by columns")
x1 = _orange_to_numpy(e1)
x2 = _orange_to_numpy(e2)
with np.errstate(invalid="ignore"): # nans are handled below
dist = self.compute_distances(x1, x2)
if self.impute and np.isnan(dist).any():
dist = np.nan_to_num(dist)
if isinstance(e1, (Table, RowInstance)):
dist = DistMatrix(dist, e1, e2, self.axis)
else:
dist = DistMatrix(dist)
return dist
def compute_distances(self, x1, x2):
"""
Abstract method for computation of distances between rows or columns of
`x1`, or between rows of `x1` and `x2`. Do not call directly."""
raise NotImplementedError
class FittedDistanceModel(DistanceModel):
"""
Base class for models that store attribute-related data for normalization
and imputation, and that treat discrete and continuous columns separately.
Attributes:
attributes (list of `Variable`): attributes on which the model was fit
discrete (np.ndarray): bool array indicating discrete attributes
continuous (np.ndarray): bool array indicating continuous attributes
normalize (bool):
if `True` (default is `False`) continuous columns are normalized
callback (callable or None): callback function
"""
def __init__(self, attributes, axis=1, impute=False, callback=None):
super().__init__(axis, impute, callback)
self.attributes = attributes
self.discrete = None
self.continuous = None
self.normalize = False
def __call__(self, e1, e2=None):
if self.attributes is not None and (
e1.domain.attributes != self.attributes
or e2 is not None and e2.domain.attributes != self.attributes):
raise ValueError("mismatching domains")
return super().__call__(e1, e2)
def continuous_columns(self, x1, x2, offset, scale):
"""
Extract and scale continuous columns from data tables.
If the second table is None, it defaults to the first table.
Values are scaled if `self.normalize` is `True`.
Args:
x1 (np.ndarray): first table
x2 (np.ndarray or None): second table
offset (float): a constant (e.g. mean, median) subtracted from data
scale: (float): divider (e.g. deviation)
Returns:
data1 (np.ndarray): scaled continuous columns from `x1`
data2 (np.ndarray): scaled continuous columns from `x2` or `x1`
"""
if self.continuous.all() and not self.normalize:
data1, data2 = x1, x2
else:
data1 = x1[:, self.continuous]
if x2 is not None:
data2 = x2[:, self.continuous]
if self.normalize:
data1 = x1[:, self.continuous]
data1 -= offset
data1 /= scale
if x2 is not None:
data2 = x2[:, self.continuous]
data2 -= offset
data2 /= scale
if x2 is None:
data2 = data1
return data1, data2
def discrete_columns(self, x1, x2):
"""
Return discrete columns from the given tables.
If the second table is None, it defaults to the first table.
"""
if self.discrete.all():
data1, data2 = x1, x1 if x2 is None else x2
else:
data1 = x1[:, self.discrete]
data2 = data1 if x2 is None else x2[:, self.discrete]
return data1, data2
class FittedDistance(Distance):
"""
Base class for fitting models that store attribute-related data for
normalization and imputation, and that treat discrete and continuous
columns separately.
The class implements a method `fit` that calls either `fit_columns`
or `fit_rows` with the data and the number of values for discrete
attributes. The provided method `fit_rows` calls methods
`get_discrete_stats` and `get_continuous_stats` that can be implemented
in derived classes.
Class attribute `rows_model_type` contains the type of the model returned by
`fit_rows`.
"""
rows_model_type = None #: Option[FittedDistanceModel]
def fit(self, data):
"""
Prepare the data on attributes, call `fit_cols` or `fit_rows` and
return the resulting model.
"""
x = _orange_to_numpy(data)
if hasattr(data, "domain"):
attributes = data.domain.attributes
n_vals = np.fromiter(
(len(attr.values) if attr.is_discrete else 0
for attr in attributes),
dtype=np.int32, count=len(attributes))
else:
assert isinstance(x, np.ndarray)
attributes = None
n_vals = np.zeros(x.shape[1], dtype=np.int32)
return [self.fit_cols, self.fit_rows][self.axis](attributes, x, n_vals)
def fit_cols(self, attributes, x, n_vals):
"""
Return DistanceModel for computation of distances between columns.
Derived classes must define this method.
Args:
attributes (list of Orange.data.Variable): list of attributes
x (np.ndarray): data
n_vals (np.ndarray): number of attribute values, 0 for continuous
"""
raise NotImplementedError
def fit_rows(self, attributes, x, n_vals):
"""
Return a DistanceModel for computation of distances between rows.
The model type is `self.row_distance_model`. It stores the data for
imputation of discrete and continuous values, and for normalization
of continuous values. Typical examples are the Euclidean and Manhattan
distance, for which the following data is stored:
For continuous columns:
- offsets[col] is the number subtracted from values in column `col`
- scales[col] is the divisor for values in columns `col`
- dist_missing2_cont[col]: the value used for distance between two
missing values in column `col`
For discrete values:
- dist_missing_disc[col, value] is the distance added for the given
`value` in the column `col` if the value for the other row is
missing
- dist_missing2_disc[col]: the distance between two missing values in
column `col`
"""
n_cols = len(n_vals)
discrete = n_vals > 0
n_bins = max(n_vals, default=0)
n_discrete = sum(discrete)
dist_missing_disc = np.zeros((n_discrete, n_bins), dtype=float)
dist_missing2_disc = np.zeros(n_discrete, dtype=float)
continuous = ~discrete
n_continuous = sum(continuous)
offsets = np.zeros(n_continuous, dtype=float)
scales = np.empty(n_continuous, dtype=float)
dist_missing2_cont = np.zeros(n_continuous, dtype=float)
curr_disc = curr_cont = 0
for col in range(n_cols):
column = x[:, col]
if np.isnan(column).all():
continuous[col] = discrete[col] = False
elif discrete[col]:
discrete_stats = self.get_discrete_stats(column, n_bins)
if discrete_stats is not None:
dist_missing_disc[curr_disc], \
dist_missing2_disc[curr_disc] = discrete_stats
curr_disc += 1
else:
continuous_stats = self.get_continuous_stats(column)
if continuous_stats is not None:
offsets[curr_cont], scales[curr_cont],\
dist_missing2_cont[curr_cont] = continuous_stats
curr_cont += 1
else:
continuous[col] = False
# pylint: disable=not-callable
return self.rows_model_type(
attributes, impute, getattr(self, "normalize", False),
continuous, discrete,
offsets[:curr_cont], scales[:curr_cont],
dist_missing2_cont[:curr_cont],
dist_missing_disc, dist_missing2_disc,
self.callback)
@staticmethod
def get_discrete_stats(column, n_bins):
"""
Return tables used computing distance between missing discrete values.
Args:
column (np.ndarray): column data
n_bins (int): maximal number of bins in the dataset
Returns:
dist_missing_disc (np.ndarray): `dist_missing_disc[value]` is
1 - probability of `value`, which is used as the distance added
for the given `value` in the column `col` if the value for the
other row is missing
dist_missing2_disc (float): the distance between two missing
values in this columns
"""
dist = util.bincount(column, minlength=n_bins)[0]
dist /= max(1, sum(dist))
return 1 - dist, 1 - np.sum(dist ** 2)
def get_continuous_stats(self, column):
"""
Compute statistics for imputation and normalization of continuous data.
Derived classes must define this method.
Args:
column (np.ndarray): column data
Returns:
offset (float): the number subtracted from values in column
scales (float): the divisor for values in column
dist_missing2_cont (float): the value used for distance between two
missing values in column
"""
raise NotImplementedError
# Fallbacks for distances in sparse data
# To be removed as the corresponding functionality is implemented properly
class SklDistance:
"""
Wrapper for functions sklearn's metrics. Used only as temporary fallbacks
when `Euclidean`, `Manhattan`, `Cosine` or `Jaccard` are given sparse data
or raw numpy arrays. These classes can't handle discrete or missing data
and normalization. Do not use for wrapping new classes.
"""
def __init__(self, metric):
self.metric = metric
def __call__(self, e1, e2=None, axis=1, impute=False):
x1 = _orange_to_numpy(e1)
x2 = _orange_to_numpy(e2)
if axis == 0:
x1 = x1.T
if x2 is not None:
x2 = x2.T
dist = skl_metrics.pairwise.pairwise_distances(
x1, x2, metric=self.metric)
if impute and np.isnan(dist).any():
dist = np.nan_to_num(dist)
if isinstance(e1, (Table, RowInstance)):
dist_matrix = DistMatrix(dist, e1, e2, axis)
else:
dist_matrix = DistMatrix(dist)
return dist_matrix
```