"""Pandas DataFrame↔Table conversion helpers"""
from functools import partial
from itertools import zip_longest
import numpy as np
from scipy import sparse as sp
from scipy.sparse import csr_matrix
import pandas as pd
from pandas.core.arrays import SparseArray
import pandas.core.arrays.sparse.accessor
from pandas.api.types import (
is_object_dtype,
is_datetime64_any_dtype,
is_numeric_dtype,
is_integer_dtype,
)
from Orange.data import (
Table, Domain, DiscreteVariable, StringVariable, TimeVariable,
ContinuousVariable,
)
from Orange.data.table import Role
__all__ = ['table_from_frame', 'table_to_frame']
# Patch a bug in pandas SparseFrameAccessor.to_dense
# As of pandas=3.0.0.dev0+1524.g23c497bb2f, to_dense ignores _constructor
# and alwats returns DataFrame.
if pd.__version__ < "3":
def to_dense(self):
# pylint: disable=protected-access
data = {k: v.array.to_dense() for k, v in self._parent.items()}
constr = self._parent._constructor
return constr(data, index=self._parent.index, columns=self._parent.columns)
pandas.core.arrays.sparse.accessor.SparseFrameAccessor.to_dense = to_dense
class OrangeDataFrame(pd.DataFrame):
_metadata = ["orange_variables", "orange_weights",
"orange_attributes", "orange_role"]
def __init__(self, *args, **kwargs):
"""
A pandas DataFrame wrapper for one of Table's numpy arrays:
- sets index values corresponding to Orange's global row indices
e.g. ['_o1', '_o2'] (allows Orange to handle selection)
- remembers the array's role in the Table (attribute, class var, meta)
- keeps the Variable objects, and uses them in back-to-table conversion,
should a column name match a variable's name
- stores weight values (legacy)
Parameters
----------
table : Table
orange_role : Role, (default=Role.Attribute)
When converting back to an orange table, the DataFrame will
convert to the right role (attrs, class vars, or metas)
"""
if len(args) <= 0 or not isinstance(args[0], Table):
super().__init__(*args, **kwargs)
return
table = args[0]
if 'orange_role' in kwargs:
role = kwargs.pop('orange_role')
elif len(args) >= 2:
role = args[1]
else:
role = Role.Attribute
if role == Role.Attribute:
data = table.X
vars_ = table.domain.attributes
elif role == Role.ClassAttribute:
data = table.Y
vars_ = table.domain.class_vars
else: # if role == Role.Meta:
data = table.metas
vars_ = table.domain.metas
index = ['_o' + str(id_) for id_ in table.ids]
varsdict = {var._name: var for var in vars_}
columns = list(varsdict.keys())
if sp.issparse(data):
data = data.asformat('csc')
sparrays = [SparseArray.from_spmatrix(data[:, i]) for i in range(data.shape[1])]
data = dict(enumerate(sparrays))
super().__init__(data, index=index, **kwargs)
self.columns = columns
else:
copy = kwargs.pop("copy", False)
super().__init__(
data=data, index=index, columns=columns, copy=copy, **kwargs
)
self.orange_role = role
self.orange_variables = varsdict
self.orange_weights = (dict(zip(index, table.W))
if table.W.size > 0 else {})
self.orange_attributes = table.attributes
@property
def _constructor(self):
return partial(self.from_existing, self)
@staticmethod
def from_existing(existing, *args, **kwargs):
self = type(existing)(*args, **kwargs)
self.__finalize__(existing)
return self
def to_orange_table(self):
return table_from_frame(self)
def __finalize__(self, other, method=None, **_):
"""
propagate metadata from other to self
Parameters
----------
other : the object from which to get the attributes that we are going
to propagate
method : optional, a passed method name ; possibly to take different
types of propagation actions based on this
"""
if method == 'concat':
objs = other.objs
elif method == 'merge':
objs = other.left, other.right
else:
objs = [other]
orange_role = getattr(self, 'orange_role', None)
dicts = {dname: getattr(self, dname, {})
for dname in ('orange_variables',
'orange_weights',
'orange_attributes')}
for obj in objs:
other_role = getattr(obj, 'orange_role', None)
if other_role is not None:
orange_role = other_role
for dname, dict_ in dicts.items():
other_dict = getattr(obj, dname, {})
dict_.update(other_dict)
object.__setattr__(self, 'orange_role', orange_role)
for dname, dict_ in dicts.items():
object.__setattr__(self, dname, dict_)
return self
pd.DataFrame.__finalize__ = __finalize__
def _reset_index(df: pd.DataFrame) -> pd.DataFrame:
"""If df index is not a simple RangeIndex (or similar), include it into a table"""
if (
# not range-like index - test first to skip slow startswith(_o) check
not (
is_integer_dtype(df.index)
and (df.index.is_monotonic_increasing or df.index.is_monotonic_decreasing)
)
# check that it does not contain Orange index
and (
# startswith is slow (for long dfs) - firs check if col has strings
isinstance(df.index, pd.MultiIndex)
or not is_object_dtype(df.index)
or not any(str(i).startswith("_o") for i in df.index)
)
):
df = df.reset_index()
return df
def _is_discrete(s, force_nominal):
return (
isinstance(s.dtype, pd.CategoricalDtype)
or is_object_dtype(s)
and (force_nominal or s.nunique() < s.size**0.666)
)
def _is_datetime(s):
if is_datetime64_any_dtype(s):
return True
try:
if is_object_dtype(s):
# pd.to_datetime would successfully parse column of numbers to datetime
# but for column of object dtype with numbers we want to be either
# discrete or string - following code try to parse column to numeric
# if conversion to numeric is successful return False
try:
pd.to_numeric(s)
return False
except (ValueError, TypeError):
pass
# utc=True - to allow different timezones in a series object
pd.to_datetime(s, utc=True)
return True
except Exception: # pylint: disable=broad-except
pass
return False
def _convert_datetime(series, var):
def col_type(dt):
"""Test if is date, time or datetime"""
dt_nonnat = dt[~pd.isnull(dt)] # nat == nat is False
if (dt_nonnat.dt.floor("D") == dt_nonnat).all():
# all times are 00:00:00.0 - pure date
return 1, 0
elif (dt_nonnat.dt.date == pd.Timestamp("now").date()).all():
# all dates are today's date - pure time
return 0, 1 # pure time
else:
# else datetime
return 1, 1
try:
dt = pd.to_datetime(series)
except ValueError:
# series with type object and different timezones will raise a
# ValueError - normalizing to utc
dt = pd.to_datetime(series, utc=True)
# set variable type to date, time or datetime
var.have_date, var.have_time = col_type(dt)
if dt.dt.tz is not None:
# set timezone if available and convert to utc
var.timezone = dt.dt.tz
dt = dt.dt.tz_convert("UTC")
if var.have_time and not var.have_date:
# if time only measure seconds from midnight - equal to setting date
# to unix epoch
return (
(dt.dt.tz_localize(None) - pd.Timestamp("now").normalize())
/ pd.Timedelta("1s")
).values
return (
(dt.dt.tz_localize(None) - pd.Timestamp("1970-01-01")) / pd.Timedelta("1s")
).values
def to_categorical(s, _):
x = s.astype("category").cat.codes
# it is same than x.replace(-1, np.nan), but much faster
x = x.where(x != -1, np.nan)
return np.asarray(x)
def to_numeric(s, _):
return np.asarray(pd.to_numeric(s))
def vars_from_df(df, role=None, force_nominal=False, variables=None):
if variables is not None:
assert len(variables) == len(df.columns)
if role is None and hasattr(df, 'orange_role'):
role = df.orange_role
df = _reset_index(df)
cols = [], [], []
exprs = [], [], []
vars_ = [], [], []
def _convert_string(s, _):
return np.asarray(
# to object so that fillna can replace with nans if Unknown in nan
# replace nan with object Unknown assure that all values are string
s.astype(object).fillna(StringVariable.Unknown).astype(str),
dtype=object
)
conversions = {
DiscreteVariable: to_categorical,
ContinuousVariable: to_numeric,
TimeVariable: _convert_datetime,
StringVariable: _convert_string
}
for column, var in zip_longest(df.columns, variables or [], fillvalue=None):
s = df[column]
_role = Role.Attribute if role is None else role
if var is not None:
if not var.is_primitive():
_role = Role.Meta
expr = conversions[type(var)]
elif hasattr(df, 'orange_variables') and column in df.orange_variables:
original_var = df.orange_variables[column]
var = original_var.copy(compute_value=None)
expr = None
else:
if _is_datetime(s):
var = TimeVariable(str(column))
elif _is_discrete(s, force_nominal):
discrete = s.astype("category").cat
var = DiscreteVariable(
str(column), discrete.categories.astype(str).tolist()
)
elif is_numeric_dtype(s):
var = ContinuousVariable(
# set number of decimals to 0 if int else keeps default behaviour
str(column), number_of_decimals=(0 if is_integer_dtype(s) else None)
)
else:
if role is not None and role != Role.Meta:
raise ValueError("String variable must be in metas.")
_role = Role.Meta
var = StringVariable(str(column))
expr = conversions[type(var)]
cols[_role].append(column)
exprs[_role].append(expr)
vars_[_role].append(var)
xym = []
for a_vars, a_cols, a_expr in zip(vars_, cols, exprs):
if not a_cols:
arr = None if a_cols != cols[0] else np.empty((df.shape[0], 0))
elif not any(a_expr):
# if all c in columns table will share memory with dataframe
a_df = df if all(c in a_cols for c in df.columns) else df[a_cols]
if all(isinstance(a, pd.SparseDtype) for a in a_df.dtypes):
arr = csr_matrix(a_df.sparse.to_coo())
else:
arr = np.asarray(a_df)
else:
# we'll have to copy the table to resolve any expressions
arr = np.array(
[
expr(df[col], var) if expr else np.asarray(df[col])
for var, col, expr in zip(a_vars, a_cols, a_expr)
]
).T
xym.append(arr)
# Let the tables share memory with pandas frame
if xym[1] is not None and xym[1].ndim == 2 and xym[1].shape[1] == 1:
xym[1] = xym[1][:, 0]
return xym, Domain(*vars_)
[docs]
def table_from_frame(df, *, force_nominal=False, variables=None):
"""
Convert pandas DataFrame to Orange.data.Table.
Parameters
----------
df : pandas DataFrame
force_nominal : bool, (default=False)
Force all string variables to be nominal.
variables : list of Variable, optional
Returns
-------
Orange.data.Table
"""
XYM, domain = vars_from_df(df,
force_nominal=force_nominal,
variables=variables)
if hasattr(df, 'orange_weights') and hasattr(df, 'orange_attributes'):
W = [df.orange_weights[i] for i in df.index if i in df.orange_weights]
if len(W) != len(df.index):
W = None
attributes = df.orange_attributes
if isinstance(df.index, pd.MultiIndex) or not is_object_dtype(df.index):
# we can skip checking for Orange indices when MultiIndex an when
# not string dtype and so speedup the conversion
ids = None
else:
ids = [
int(i[2:])
if str(i).startswith("_o") and i[2:].isdigit()
else Table.new_id()
for i in df.index
]
else:
W = None
attributes = None
ids = None
return Table.from_numpy(
domain,
*XYM,
W=W,
attributes=attributes,
ids=ids
)
def table_from_frames(xdf, ydf, mdf):
if not (xdf.index.equals(ydf.index) and xdf.index.equals(mdf.index)):
raise ValueError(
"Indexes not equal. Make sure that all three dataframes have equal index"
)
# drop index from x and y - it makes sure that index if not range will be
# placed in metas
xdf = xdf.reset_index(drop=True)
ydf = ydf.reset_index(drop=True)
dfs = xdf, ydf, mdf
if not all(df.shape[0] == xdf.shape[0] for df in dfs):
raise ValueError(f"Leading dimension mismatch "
f"(not {xdf.shape[0]} == {ydf.shape[0]} == {mdf.shape[0]})")
xXYM, xDomain = vars_from_df(xdf, role=Role.Attribute)
yXYM, yDomain = vars_from_df(ydf, role=Role.ClassAttribute)
mXYM, mDomain = vars_from_df(mdf, role=Role.Meta)
XYM = (xXYM[0], yXYM[1], mXYM[2])
domain = Domain(xDomain.attributes, yDomain.class_vars, mDomain.metas)
ids = [
int(idx[2:])
if str(idx).startswith("_o") and idx[2:].isdigit()
else Table.new_id()
for idx in mdf.index
]
attributes = {}
W = None
for df in dfs:
if isinstance(df, OrangeDataFrame):
W = [df.orange_weights[i] for i in df.index if i in df.orange_weights]
if len(W) != len(df.index):
W = None
attributes.update(df.orange_attributes)
else:
W = None
return Table.from_numpy(
domain,
*XYM,
W=W,
attributes=attributes,
ids=ids
)
[docs]
def table_to_frame(tab, include_metas=False):
"""
Convert Orange.data.Table to pandas.DataFrame
Parameters
----------
tab : Table
include_metas : bool, (default=False)
Include table metas into dataframe.
Returns
-------
pandas.DataFrame
"""
def _column_to_series(col, vals):
result = ()
if col.is_discrete:
codes = pd.Series(vals).fillna(-1).astype(int)
result = (col.name, pd.Categorical.from_codes(
codes=codes, categories=col.values, ordered=True
))
elif col.is_time:
result = (col.name, pd.to_datetime(vals, unit='s').to_series().reset_index()[0])
elif col.is_continuous:
dt = float
# np.nan are not compatible with int column
# using pd.isnull since np.isnan fails on array with dtype object
# which can happen when metas contain column with strings
if col.number_of_decimals == 0 and not np.any(pd.isnull(vals)):
dt = int
result = (col.name, pd.Series(vals).astype(dt))
elif col.is_string:
result = (col.name, pd.Series(vals))
return result
def _columns_to_series(cols, vals):
return [_column_to_series(col, vals[:, i]) for i, col in enumerate(cols)]
x, y, metas = [], [], []
domain = tab.domain
if domain.attributes:
x = _columns_to_series(domain.attributes, tab.X)
if domain.class_vars:
y_values = tab.Y.reshape(tab.Y.shape[0], len(domain.class_vars))
y = _columns_to_series(domain.class_vars, y_values)
if domain.metas:
metas = _columns_to_series(domain.metas, tab.metas)
all_series = dict(x + y + metas)
all_vars = tab.domain.variables
if include_metas:
all_vars += tab.domain.metas
original_column_order = [var.name for var in all_vars]
unsorted_columns_df = pd.DataFrame(all_series)
return unsorted_columns_df[original_column_order]
def table_to_frames(table):
xdf = OrangeDataFrame(table, Role.Attribute)
ydf = OrangeDataFrame(table, Role.ClassAttribute)
mdf = OrangeDataFrame(table, Role.Meta)
return xdf, ydf, mdf
def amend_table_with_frame(table, df, role):
arr = Role.get_arr(role, table)
if arr.shape[0] != df.shape[0]:
raise ValueError(f"Leading dimension mismatch "
f"(not {arr.shape[0]} == {df.shape[0]})")
XYM, domain = vars_from_df(df, role=role)
if role == Role.Attribute:
table.domain = Domain(domain.attributes,
table.domain.class_vars,
table.domain.metas)
table.X = XYM[0]
elif role == Role.ClassAttribute:
table.domain = Domain(table.domain.attributes,
domain.class_vars,
table.domain.metas)
table.Y = XYM[1]
else: # if role == Role.Meta:
table.domain = Domain(table.domain.attributes,
table.domain.class_vars,
domain.metas)
table.metas = XYM[2]
if isinstance(df, OrangeDataFrame):
table.attributes.update(df.orange_attributes)