import random
import re
from math import isnan
from numbers import Real
import numpy as np
import bottleneck as bn
from Orange.util import Reprable
from Orange.data import Instance, Storage, Variable
from Orange.util import Enum
# Names exported by a star-import of this module.
__all__ = ["IsDefined", "HasClass", "Random", "SameValue", "Values",
"FilterDiscrete", "FilterContinuous", "FilterString",
"FilterStringList", "FilterRegex"]
class Filter(Reprable):
    """
    Common ancestor of all filters.

    .. attribute:: negate

        If true, the selection made by the filter is inverted.
    """

    def __init__(self, negate=False):
        self.negate = negate

    def __call__(self, data):
        # The base class performs no selection; it simply returns None.
        return None

    def __eq__(self, other):
        # Equal only to filters of exactly the same class (subclasses do not
        # count) that carry the same `negate` flag.
        if type(self) is not type(other):
            return False
        return self.negate == other.negate

    def __hash__(self):
        return hash(self.negate)
[docs]
class IsDefined(Filter):
    """
    Keep the data instances that contain no undefined values. The check may
    be limited to a subset of columns.

    The behaviour can depend on the storage implementation. In particular,
    :obj:`~Orange.data.Table` with sparse matrix representation will select
    all data instances whose values are defined, even if they are zero.
    However, if individual columns are checked, it will select all rows with
    non-zero entries for these columns, disregarding whether they are stored
    as zero or omitted.

    Note that `columns` is only forwarded to the storage's own
    implementation; when a single instance is tested, or when the storage
    does not implement the filter, all of the instance's `_x` values are
    checked.

    .. attribute:: columns

        The columns to be checked, given as a sequence of indices, names or
        :obj:`Orange.data.Variable`; stored as a tuple, or `None`.
    """

    def __init__(self, columns=None, negate=False):
        super().__init__(negate)
        self.columns = None if columns is None else tuple(columns)

    def __call__(self, data):
        # A single instance yields a boolean verdict.
        if isinstance(data, Instance):
            has_missing = bn.anynan(data._x)
            return self.negate == has_missing

        # Let the storage filter itself if it knows how to.
        if isinstance(data, Storage):
            try:
                return data._filter_is_defined(self.columns, self.negate)
            except NotImplementedError:
                pass

        # Generic fallback: test the instances one by one.
        complete = np.fromiter(
            (not bn.anynan(row._x) for row in data),
            dtype=bool, count=len(data))
        mask = np.logical_not(complete) if self.negate else complete
        return data[mask]

    def __eq__(self, other):
        if not super().__eq__(other):
            return False
        return self.columns == other.columns

    def __hash__(self):
        base_hash = super().__hash__()
        return hash((base_hash, hash(self.columns)))
[docs]
class HasClass(Filter):
    """
    Keep the rows whose class value is known.

    :obj:`Orange.data.Table` implements the filter on the sparse data so
    that it returns all rows for which all class values are defined, even if
    they equal zero.
    """

    def __call__(self, data):
        # A single instance yields a boolean verdict.
        if isinstance(data, Instance):
            class_missing = bn.anynan(data._y)
            return self.negate == class_missing

        # Let the storage filter itself if it knows how to.
        if isinstance(data, Storage):
            try:
                return data._filter_has_class(self.negate)
            except NotImplementedError:
                pass

        # Generic fallback: test the instances one by one.
        known = np.fromiter(
            (not bn.anynan(row._y) for row in data), bool, len(data))
        mask = np.logical_not(known) if self.negate else known
        return data[mask]
[docs]
class Random(Filter):
    """
    Return a random selection of data instances.

    .. attribute:: prob

        If below 1, the proportion of instances to select (for a single
        instance, the probability that it is selected); if 1 or above, the
        number of instances to select. The default `None` must be replaced
        by a number before the filter is applied, since it is compared
        with numbers.
    """

    def __init__(self, prob=None, negate=False):
        super().__init__(negate)
        self.prob = prob

    def __call__(self, data):
        # A single instance is accepted with probability `prob`.
        if isinstance(data, Instance):
            drawn = random.random() < self.prob
            return self.negate != drawn

        # Let the storage filter itself if it knows how to.
        if isinstance(data, Storage):
            try:
                return data._filter_random(self.prob, self.negate)
            except NotImplementedError:
                pass

        # Generic fallback: mark the first `n` positions (or all others,
        # when negated) and shuffle the marks.
        total = len(data)
        retain = np.zeros(total, dtype=bool)
        if self.prob >= 1:
            n = int(self.prob)
        else:
            n = int(self.prob * total)
        chosen = slice(n, None) if self.negate else slice(None, n)
        retain[chosen] = True
        np.random.shuffle(retain)
        return data[retain]
[docs]
class SameValue(Filter):
    """
    Keep the data instances that have the given value in the given column.

    .. attribute:: column

        The column, described by an index, a string or
        :obj:`Orange.data.Variable`.

    .. attribute:: value

        The reference value
    """

    def __init__(self, column, value, negate=False):
        super().__init__(negate)
        self.column = column
        self.value = value

    def __call__(self, data):
        # A single instance yields a boolean verdict.
        if isinstance(data, Instance):
            matches = data[self.column] == self.value
            return self.negate != matches

        # Let the storage filter itself if it knows how to.
        if isinstance(data, Storage):
            try:
                return data._filter_same_value(
                    self.column, self.value, self.negate)
            except NotImplementedError:
                pass

        # Generic fallback: compare instance by instance.
        column = data.domain.index(self.column)
        value = self.value
        # Non-numeric references for primitive variables are converted with
        # the variable's `to_val`.
        if data.domain[column].is_primitive() and not isinstance(value, Real):
            value = data.domain[column].to_val(value)

        if column >= 0:
            def cell(inst):
                return inst[column]
        else:
            # Negative indices denote metas: -1 maps to meta 0, -2 to 1, ...
            meta_pos = -1 - column

            def cell(inst):
                return inst._metas[meta_pos]

        if self.negate:
            flags = (cell(inst) != value for inst in data)
        else:
            flags = (cell(inst) == value for inst in data)
        retain = np.fromiter(flags, bool, len(data))
        return data[retain]
[docs]
class Values(Filter):
    """
    Select the data instances by a conjunction or disjunction of conditions.
    Each condition is a filter derived from :obj:`ValueFilter`, which checks
    the value of an individual feature, or another (nested) Values filter.

    .. attribute:: conditions

        A list of conditions, derived from :obj:`ValueFilter` or
        :obj:`Values`; must not be empty.

    .. attribute:: conjunction

        If `True`, the filter computes a conjunction, otherwise a
        disjunction

    .. attribute:: negate

        Revert the selection
    """

    def __init__(self, conditions, conjunction=True, negate=False):
        super().__init__(negate)
        if not conditions:
            raise ValueError("Filter with no conditions.")
        self.conditions = conditions
        self.conjunction = conjunction

    def __call__(self, data):
        # A single instance yields a boolean verdict.
        if isinstance(data, Instance):
            combine = all if self.conjunction else any
            verdict = combine(cond(data) for cond in self.conditions)
            return self.negate != verdict

        # Let the storage filter itself if it knows how to.
        if isinstance(data, Storage):
            try:
                return data._filter_values(self)
            except NotImplementedError:
                pass

        # Generic fallback: evaluate each condition on every instance and
        # fold the resulting masks together.
        n_rows = len(data)
        if self.conjunction:
            mask = np.ones(n_rows, bool)
            merge = np.logical_and
        else:
            mask = np.zeros(n_rows, bool)
            merge = np.logical_or
        for cond in self.conditions:
            hits = np.fromiter(
                (cond(row) for row in data), bool, count=n_rows)
            mask = merge(mask, hits)
        if self.negate:
            mask = np.logical_not(mask)
        return data[mask]
class ValueFilter(Filter):
    """
    Base class for subfilters that check individual values of data
    instances. Derived classes handle discrete, continuous and string
    attributes. These filters serve as conditions in
    :obj:`Orange.data.filter.Values`.

    The internal implementations of `filter.Values` in data storages, like
    :obj:`Orange.data.Table`, recognize these filters and retrieve their
    attributes, like operators and reference values, but do not call them.

    The fallback implementation of :obj:`Orange.data.filter.Values` calls
    the subfilters with individual data instances, which is very
    inefficient.

    .. attribute:: column

        The column to which the filter applies (int, str or
        :obj:`Orange.data.Variable`).
    """

    def __init__(self, column):
        # Value filters are never negated themselves.
        super().__init__(negate=False)
        self.column = column
[docs]
class FilterDiscrete(ValueFilter):
    """
    Subfilter for discrete variables, which selects the instances whose
    value matches one of the given values.

    .. attribute:: column

        The column to which the filter applies (int, str or
        :obj:`Orange.data.Variable`).

    .. attribute:: values

        The list (or a set) of accepted values. If None, it checks whether
        the value is defined.
    """

    def __init__(self, column, values):
        super().__init__(column)
        self.values = values

    def __call__(self, inst):
        """Return True if the instance's value in `column` is accepted."""
        value = inst[inst.domain.index(self.column)]
        if self.values is None:
            # No explicit values: accept anything that is defined.
            return not isnan(value)
        return value in self.values

    def __eq__(self, other):
        return (isinstance(other, FilterDiscrete)
                and self.column == other.column
                and self.values == other.values)

    def __hash__(self):
        # Defining __eq__ alone would set __hash__ to None and make the
        # filter unhashable, unlike its base classes. Hash the same fields
        # that __eq__ compares; `values` may be a list or a set, so it is
        # hashed through a frozenset.
        values = None if self.values is None else frozenset(self.values)
        return hash((self.column, values))
[docs]
class FilterContinuous(ValueFilter):
    """
    Subfilter for continuous variables.

    .. attribute:: column

        The column to which the filter applies (int, str or
        :obj:`Orange.data.Variable`). It is given to the constructor as
        `position` and is also stored under that name.

    .. attribute:: ref

        The reference value; also aliased to `min` for operators
        `Between` and `Outside`.

    .. attribute:: max

        The upper threshold for operators `Between` and `Outside`.

    .. attribute:: oper

        The operator; should be `FilterContinuous.Equal`, `NotEqual`, `Less`,
        `LessEqual`, `Greater`, `GreaterEqual`, `Between`, `Outside` or
        `IsDefined`.
    """
    Type = Enum('FilterContinuous',
                'Equal, NotEqual, Less, LessEqual, Greater,'
                'GreaterEqual, Between, Outside, IsDefined')
    (Equal, NotEqual, Less, LessEqual, Greater, GreaterEqual,
     Between, Outside, IsDefined) = Type

    def __init__(self, position, oper, ref=None, max=None, min=None):
        super().__init__(position)
        # `min` is an alias for `ref` and takes precedence when given.
        self.ref = ref if min is None else min
        self.max = max
        self.oper = oper
        self.position = position

    @property
    def min(self):
        return self.ref

    @min.setter
    def min(self, value):
        self.ref = value

    def __call__(self, inst):
        """
        Return True if the instance's value in `column` satisfies the
        operator; raise ValueError for an unknown operator.
        """
        value = inst[inst.domain.index(self.column)]
        if isnan(value):
            # An undefined value matches only an `Equal` test against an
            # undefined reference.
            return self.oper == self.Equal and isnan(self.ref)
        if self.oper == self.Equal:
            return value == self.ref
        if self.oper == self.NotEqual:
            return value != self.ref
        if self.oper == self.Less:
            return value < self.ref
        if self.oper == self.LessEqual:
            return value <= self.ref
        if self.oper == self.Greater:
            return value > self.ref
        if self.oper == self.GreaterEqual:
            return value >= self.ref
        if self.oper == self.Between:
            return self.ref <= value <= self.max
        if self.oper == self.Outside:
            return not self.ref <= value <= self.max
        if self.oper == self.IsDefined:
            # Undefined values were already rejected above.
            return True
        raise ValueError("invalid operator")

    def __eq__(self, other):
        return (isinstance(other, FilterContinuous)
                and self.column == other.column
                and self.oper == other.oper
                and self.ref == other.ref
                and self.max == other.max)

    def __hash__(self):
        # Defining __eq__ alone would set __hash__ to None and make the
        # filter unhashable, unlike its base classes. Hash the same fields
        # that __eq__ compares.
        return hash((self.column, self.oper, self.ref, self.max))

    def __str__(self):
        if isinstance(self.column, str):
            column = self.column
        elif isinstance(self.column, Variable):
            column = self.column.name
        else:
            column = "feature({})".format(self.column)

        names = {self.Equal: "=", self.NotEqual: "≠",
                 self.Less: "<", self.LessEqual: "≤",
                 self.Greater: ">", self.GreaterEqual: "≥"}
        if self.oper in names:
            return "{} {} {}".format(column, names[self.oper], self.ref)
        if self.oper == self.Between:
            return "{} ≤ {} ≤ {}".format(self.min, column, self.max)
        if self.oper == self.Outside:
            return "not {} ≤ {} ≤ {}".format(self.min, column, self.max)
        if self.oper == self.IsDefined:
            return "{} is defined".format(column)
        return "invalid operator"
[docs]
class FilterString(ValueFilter):
    """
    Subfilter for string variables.

    .. attribute:: column

        The column to which the filter applies (int, str or
        :obj:`Orange.data.Variable`). It is given to the constructor as
        `position` and is also stored under that name.

    .. attribute:: ref

        The reference value; also aliased to `min` for operators
        `Between` and `Outside`.

    .. attribute:: max

        The upper threshold for operators `Between` and `Outside`.

    .. attribute:: oper

        The operator; should be `FilterString.Equal`, `NotEqual`, `Less`,
        `LessEqual`, `Greater`, `GreaterEqual`, `Between`, `Outside`,
        `Contains`, `NotContain`, `StartsWith`, `NotStartsWith`, `EndsWith`,
        `NotEndsWith`, `IsDefined` or `NotIsDefined`.

    .. attribute:: case_sensitive

        Tells whether the comparisons are case sensitive
    """
    Type = Enum('FilterString',
                'Equal, NotEqual, Less, LessEqual, Greater,'
                'GreaterEqual, Between, Outside, Contains, NotContain,'
                'StartsWith, NotStartsWith, EndsWith, NotEndsWith, IsDefined, NotIsDefined')
    (Equal, NotEqual, Less, LessEqual, Greater, GreaterEqual,
     Between, Outside, Contains, NotContain, StartsWith, NotStartsWith,
     EndsWith, NotEndsWith, IsDefined, NotIsDefined) = Type

    def __init__(self, position, oper, ref=None, max=None,
                 case_sensitive=True, **a):
        super().__init__(position)
        if a:
            # The only extra keyword accepted is `min`, an alias for `ref`.
            if len(a) != 1 or "min" not in a:
                raise TypeError(
                    "FilterString got unexpected keyword arguments")
            ref = a["min"]
        self.ref = ref
        self.max = max
        self.oper = oper
        self.case_sensitive = case_sensitive
        self.position = position

    @property
    def min(self):
        return self.ref

    @min.setter
    def min(self, value):
        self.ref = value

    def __call__(self, inst):
        """
        Return True if the instance's value in `column` satisfies the
        operator; raise ValueError for an unknown operator.
        """
        # the function is a large 'switch'; pylint: disable=too-many-branches
        value = inst[inst.domain.index(self.column)]
        if self.oper == self.IsDefined:
            return not np.isnan(value)
        if self.oper == self.NotIsDefined:
            return np.isnan(value)

        # All remaining operators compare string representations.
        if self.case_sensitive:
            value = str(value)
            refval = str(self.ref)
        else:
            value = str(value).lower()
            refval = str(self.ref).lower()

        if self.oper == self.Equal:
            return value == refval
        if self.oper == self.NotEqual:
            return value != refval
        if self.oper == self.Less:
            return value < refval
        if self.oper == self.LessEqual:
            return value <= refval
        if self.oper == self.Greater:
            return value > refval
        if self.oper == self.GreaterEqual:
            return value >= refval
        if self.oper == self.Contains:
            return refval in value
        if self.oper == self.NotContain:
            return refval not in value
        if self.oper == self.StartsWith:
            return value.startswith(refval)
        if self.oper == self.NotStartsWith:
            return not value.startswith(refval)
        if self.oper == self.EndsWith:
            return value.endswith(refval)
        if self.oper == self.NotEndsWith:
            return not value.endswith(refval)

        # The upper bound is touched only by the range operators, so that an
        # unknown operator reaches the ValueError below even when `max` is
        # not set.
        if self.oper in (self.Between, self.Outside):
            high = self.max if self.case_sensitive else self.max.lower()
            inside = refval <= value <= high
            return inside if self.oper == self.Between else not inside
        raise ValueError("invalid operator")
[docs]
class FilterStringList(ValueFilter):
    """
    Subfilter for string variables that accepts a value when it appears in
    the given list of accepted values.

    .. attribute:: column

        The column to which the filter applies (int, str or
        :obj:`Orange.data.Variable`).

    .. attribute:: values

        The list (or a set) of accepted values. Assigning it also refreshes
        `values_lower`, the list of lower-cased values used for
        case-insensitive matching.

    .. attribute:: case_sensitive

        Tells whether the comparisons are case sensitive
    """

    def __init__(self, column, values, case_sensitive=True):
        super().__init__(column)
        self.case_sensitive = case_sensitive
        self.values = values

    @property
    def values(self):
        return self._values

    @values.setter
    def values(self, values):
        self._values = values
        # Keep the lower-cased copy in step with the accepted values.
        self.values_lower = [accepted.lower() for accepted in values]

    def __call__(self, inst):
        value = inst[inst.domain.index(self.column)]
        if not self.case_sensitive:
            return value.lower() in self.values_lower
        return value in self._values
[docs]
class FilterRegex(ValueFilter):
    """
    Filter that checks whether the values match the regular expression.

    The pattern is compiled once, at construction, with the given flags.
    """

    def __init__(self, column, pattern, flags=0):
        super().__init__(column)
        self.column = column
        self.pattern = pattern
        self.flags = flags
        self._re = re.compile(pattern, flags)

    def __call__(self, inst):
        # The argument itself is searched; a falsy argument is treated as an
        # empty string. `search` returns None when nothing matches.
        found = self._re.search(inst or '')
        return found is not None