Source code for Orange.misc.distmatrix

import os.path

import numpy as np

from Orange.misc import _distmatrix_xlsx
from Orange.util import deprecated


class DistMatrix(np.ndarray):
    """
    Distance matrix. Extends ``numpy.ndarray``.

    .. attribute:: row_items

        Items corresponding to matrix rows.

    .. attribute:: col_items

        Items corresponding to matrix columns.

    .. attribute:: axis

        If axis=1 we calculate distances between rows,
        if axis=0 we calculate distances between columns.
    """
    def __new__(cls, data, row_items=None, col_items=None, axis=1):
        """Construct a new distance matrix containing the given data.

        :param data: Distance matrix
        :type data: numpy array
        :param row_items: Items in matrix rows
        :type row_items: `Orange.data.Table` or `Orange.data.Instance`
        :param col_items: Items in matrix columns
        :type col_items: `Orange.data.Table` or `Orange.data.Instance`
        :param axis: The axis along which the distances are calculated
        :type axis: int
        """
        obj = np.asarray(data).view(cls)
        obj.row_items = row_items
        obj.col_items = col_items
        obj.axis = axis
        return obj

    def __array_finalize__(self, obj):
        """See http://docs.scipy.org/doc/numpy/user/basics.subclassing.html"""
        # defined in __new___, pylint: disable=attribute-defined-outside-init
        if obj is None:
            return
        # Views and results of operations inherit the metadata of the array
        # they were derived from; plain ndarrays get the defaults.
        self.row_items = getattr(obj, 'row_items', None)
        self.col_items = getattr(obj, 'col_items', None)
        self.axis = getattr(obj, 'axis', 1)

    def __array_wrap__(self, out_arr, context=None, return_scalar=False):
        """
        Wrap the result of a numpy operation.

        Zero-dimensional results are unpacked into scalars; everything else
        is wrapped by ``numpy.ndarray.__array_wrap__``.

        `return_scalar` is accepted because NumPy 2 passes it as a third
        argument; it is not needed here because 0-d results are always
        converted to scalars.
        """
        # pylint: disable=unused-argument
        if out_arr.ndim == 0:  # a single scalar
            return out_arr[()]
        return np.ndarray.__array_wrap__(self, out_arr, context)

    # __reduce__() and __setstate__() ensure DistMatrix is picklable.
    def __reduce__(self):
        """Extend ndarray's pickle state with row_items, col_items and axis."""
        state = super().__reduce__()
        newstate = state[2] + (self.row_items, self.col_items, self.axis)
        return state[0], state[1], newstate

    # noinspection PyMethodOverriding,PyArgumentList
    def __setstate__(self, state):
        """Restore the state produced by :obj:`__reduce__`.

        The last three elements of `state` are `row_items`, `col_items` and
        `axis`; the remainder is passed on to ``numpy.ndarray``.
        """
        # defined in __new___, pylint: disable=attribute-defined-outside-init
        self.row_items = state[-3]
        self.col_items = state[-2]
        self.axis = state[-1]
        super().__setstate__(state[0:-3])

    @property
    @deprecated
    def dim(self):
        """Returns the single dimension of the symmetric square matrix."""
        return self.shape[0]

    # noinspection PyPep8Naming
    @property
    @deprecated
    def X(self):
        """Return the matrix itself (deprecated alias)."""
        return self

    @property
    def flat(self):
        """Return the elements strictly above the main diagonal.

        Elements are selected with ``np.triu_indices(self.shape[0], 1)``.
        """
        return self[np.triu_indices(self.shape[0], 1)]

    def submatrix(self, row_items, col_items=None):
        """
        Return a submatrix

        Args:
            row_items: indices of rows
            col_items: indices of columns; if `None` or empty, `row_items`
                are used for columns, too
        """
        # Do not test the truth value of `col_items`: a one-element array
        # like `np.array([0])` is falsy and longer arrays raise ValueError.
        if col_items is None or len(col_items) == 0:
            col_items = row_items
        obj = self[np.ix_(row_items, col_items)]
        if isinstance(self.row_items, list):
            obj.row_items = list(np.array(self.row_items)[row_items])
        elif self.row_items is not None:
            obj.row_items = self.row_items[row_items]
        if self.col_items is self.row_items and col_items is row_items:
            # keep rows and columns referring to the very same object
            obj.col_items = obj.row_items
        elif isinstance(self.col_items, list):
            obj.col_items = list(np.array(self.col_items)[col_items])
        elif self.col_items is not None:
            obj.col_items = self.col_items[col_items]
        return obj

    @classmethod
    def from_file(cls, filename, sheet=None):
        """
        Load distance matrix from a file

        Files with extension ``.xlsx`` are read by
        `_distmatrix_xlsx.read_matrix`; all other files are read as text in
        the format described below.

        The file should be preferably encoded in ascii/utf-8. White space at
        the beginning and end of lines is ignored.

        The first line of the file starts with the matrix dimension. It
        can be followed by a list of flags

        - *axis=<number>*: the axis number
        - *symmetric*: the matrix is symmetric; when reading the element (i, j)
          its value is also assigned to (j, i)
        - *asymmetric*: the matrix is asymmetric
        - *row_labels*: the file contains row labels
        - *col_labels*: the file contains column labels

        By default, matrices are symmetric, have axis 1 and no labels are
        given. Flags *labeled* and *labelled* are obsolete aliases for
        *row_labels*.

        If the file has column labels, they follow in the second line.
        Row labels appear at the beginning of each row.
        Labels are arbitrary strings that cannot contain newlines and
        tabulators. Labels are stored as instances of `Table` with a single
        meta attribute named "label".

        The remaining lines contain tab-separated numbers, preceded with labels,
        if present. Lines are padded with zeros if necessary. If the matrix is
        symmetric, the file contains the lower triangle; any data above the
        diagonal is ignored.

        Args:
            filename: file name
            sheet: passed to the xlsx reader for ``.xlsx`` files; must be
                `None` for any other file
        """
        _, ext = os.path.splitext(filename)
        if ext == ".xlsx":
            matrix, row_labels, col_labels, axis \
                = _distmatrix_xlsx.read_matrix(filename, sheet)
        else:
            assert sheet is None
            matrix, row_labels, col_labels, axis = cls._from_dst(filename)
        return cls(matrix,
                   cls._labels_to_tables(row_labels),
                   cls._labels_to_tables(col_labels),
                   axis)

    @staticmethod
    def _labels_to_tables(labels):
        """
        Convert a sequence of labels to a `Table` with a single string meta
        attribute named "label".

        `None` and instances of `Table` are returned unchanged.
        """
        # prevent circular imports, pylint: disable=import-outside-toplevel
        from Orange.data import Table, StringVariable, Domain

        if labels is None or isinstance(labels, Table):
            return labels
        return Table.from_numpy(
            Domain([], metas=[StringVariable("label")]),
            np.empty((len(labels), 0)), None, np.array(labels)[:, None])

    @classmethod
    def _from_dst(cls, filename):
        """
        Read a matrix from a text file in the format described at
        :obj:`~Orange.misc.distmatrix.DistMatrix.from_file`.

        Args:
            filename: file name

        Returns:
            a tuple `(matrix, row_labels, col_labels, axis)`; labels are
            lists of strings or `None` if the file does not contain them

        Raises:
            ValueError: if the file is empty, the header is malformed or
                the data does not match the declared dimension
        """
        # prevent circular imports, pylint: disable=import-outside-toplevel
        from Orange.data.io import detect_encoding

        with open(filename, encoding=detect_encoding(filename)) as fle:
            line = fle.readline()
            if not line:
                raise ValueError("empty file")
            data = line.strip().split()
            # `data` is empty if the first line contains only white space
            if not data or not data[0].isdigit():
                raise ValueError("distance file must begin with dimension")
            n = int(data.pop(0))
            symmetric = True
            axis = 1
            col_labels = row_labels = None
            for flag in data:
                if flag in ("labelled", "labeled", "row_labels"):
                    row_labels = []
                elif flag == "col_labels":
                    col_labels = []
                elif flag == "symmetric":
                    symmetric = True
                elif flag == "asymmetric":
                    symmetric = False
                else:
                    # the only remaining valid flag is axis=<number>
                    flag_data = flag.split("=")
                    if len(flag_data) == 2:
                        name, value = map(str.strip, flag_data)
                    else:
                        name, value = "", None
                    if name == "axis" and value.isdigit():
                        axis = int(value)
                    else:
                        raise ValueError(f"invalid flag '{flag}'")
            if col_labels is not None:
                col_labels = [x.strip()
                              for x in fle.readline().strip().split("\t")]
                if len(col_labels) != n:
                    raise ValueError("mismatching number of column labels, "
                                     f"{len(col_labels)} != {n}")

            def num_or_lab(index, labels):
                # label, if available, otherwise 1-based index
                return f"'{labels[index]}'" if labels else str(index + 1)

            matrix = np.zeros((n, n))
            for i, line in enumerate(fle):
                if i >= n:
                    raise ValueError("too many rows")
                line = line.strip().split("\t")
                if row_labels is not None:
                    row_labels.append(line.pop(0).strip())
                if len(line) > n:
                    raise ValueError(
                        f"too many columns in matrix row "
                        f"{num_or_lab(i, row_labels)}")
                # for symmetric matrices, data above the diagonal is ignored
                for j, e in enumerate(line[:i + 1 if symmetric else n]):
                    try:
                        matrix[i, j] = float(e)
                    except ValueError as exc:
                        raise ValueError(
                            "invalid element at "
                            f"row {num_or_lab(i, row_labels)}, "
                            f"column {num_or_lab(j, col_labels)}") from exc
                    if symmetric:
                        matrix[j, i] = matrix[i, j]
        return matrix, row_labels, col_labels, axis

    def auto_symmetricized(self, copy=False):
        """
        Return a matrix in which an empty triangle is filled from the other.

        If exactly one of the two triangles (strictly below or strictly above
        the diagonal) consists only of NaNs, the method returns a new matrix
        in which that triangle is filled with transposed values of the other
        and NaNs on the diagonal are replaced using `np.nan_to_num`. If the
        numbers of rows and columns differ by one, a row or column is added
        to make the matrix square.

        The matrix itself (or its copy, if `copy` is `True`) is returned
        if dimensions differ by more than one, if row and column labels
        differ, if the number of items does not match the larger dimension,
        or if both triangles are empty or both contain data.

        Args:
            copy: if `True`, return a copy instead of `self` when the matrix
                is not changed
        """
        def self_or_copy():
            return self.copy() if copy else self

        def get_labels(labels):
            # object() compares unequal to anything, so labels that cannot
            # be determined are treated as different
            return np.array(labels) if isinstance(labels, list) \
                else labels.metas[:, 0] if self._trivial_labels(labels) \
                else object()

        h, w = self.shape
        m = max(w, h)
        if (abs(h - w) > 1
                or self.row_items and self.col_items
                and np.any(get_labels(self.row_items)
                           != get_labels(self.col_items))
                or self.row_items and len(self.row_items) != m
                or self.col_items and len(self.col_items) != m):
            return self_or_copy()

        nans = np.isnan(self)
        low_indices = np.tril_indices(h, -1)
        low_empty = np.all(nans[low_indices])
        high_indices = np.triu_indices(w, 1)
        high_empty = np.all(nans[high_indices])
        if low_empty is high_empty:
            # both non-empty, or both empty (only diagonal)
            return self_or_copy()

        indices = low_indices if low_empty else high_indices
        if w == h:
            matrix = np.array(self)
        else:
            if low_empty:
                row = np.vstack((self[:, -1, None], [[0]])).T
                matrix = np.vstack((self, row))
            else:
                col = np.hstack((self[-1, None], [[0]])).T
                matrix = np.hstack((self, col))
        diag_indices = np.diag_indices(len(matrix))
        matrix[diag_indices] = np.nan_to_num(matrix[diag_indices])
        matrix[indices] = self.T[indices]
        return type(self)(matrix,
                          self.row_items or self.col_items,
                          self.col_items or self.row_items)

    def _trivial_labels(self, items):
        """
        Tell whether labels can be automatically determined from `items`.

        This is the case if `items` is a list or tuple of strings, or
        a `Table` and either `axis` is 0 or exactly one of the table's meta
        attributes is a `StringVariable`.
        """
        # prevent circular imports, pylint: disable=import-outside-toplevel
        from Orange.data import Table, StringVariable

        return (isinstance(items, (list, tuple))
                and all(isinstance(item, str) for item in items)
                or
                isinstance(items, Table)
                and (self.axis == 0
                     or sum(isinstance(meta, StringVariable)
                            for meta in items.domain.metas) == 1
                     )
                )

    def is_symmetric(self):
        """
        Return `True` if the matrix is square, (approximately, as tested by
        `np.allclose`) equal to its transpose and, if both `row_items` and
        `col_items` are given, they are equal.
        """
        # prevent circular imports, pylint: disable=import-outside-toplevel
        from Orange.data import Table

        if self.shape[0] != self.shape[1] or not np.allclose(self, self.T):
            return False
        if self.row_items is None or self.col_items is None:
            return True
        if isinstance(self.row_items, Table):
            return (isinstance(self.col_items, Table)
                    and self.col_items.domain == self.row_items.domain
                    and np.array_equal(self.col_items.X, self.row_items.X)
                    and np.array_equal(self.col_items.Y, self.row_items.Y)
                    and np.array_equal(self.col_items.metas,
                                       self.row_items.metas))
        else:
            return (not isinstance(self.col_items, Table)
                    and np.array_equal(self.row_items, self.col_items))

    def has_row_labels(self):
        """
        Returns `True` if row labels can be automatically determined from data

        For this, the `row_items` must be a list or tuple of strings, or an
        instance of `Orange.data.Table`; for a table, `axis` must be 0 or
        exactly one of the meta attributes in its domain must be a string
        variable.
        """
        return self._trivial_labels(self.row_items)

    def has_col_labels(self):
        """
        Returns `True` if column labels can be automatically determined from
        data

        For this, the `col_items` must be a list or tuple of strings, or an
        instance of `Orange.data.Table`; for a table, `axis` must be 0 or
        exactly one of the meta attributes in its domain must be a string
        variable.
        """
        return self._trivial_labels(self.col_items)

    def get_labels(self, items):
        """
        Return labels for the given items.

        Args:
            items: row or column items

        Returns:
            `None` if labels cannot be determined (see `has_row_labels`);
            `items` if they are a list or tuple of strings; names of
            attributes if `axis` is 0; otherwise the column with the
            table's only string meta attribute
        """
        # prevent circular imports, pylint: disable=import-outside-toplevel
        from Orange.data import StringVariable

        if not self._trivial_labels(items):
            return None
        if isinstance(items, (list, tuple)) \
                and all(isinstance(x, str) for x in items):
            return items
        if self.axis == 0:
            return [attr.name for attr in items.domain.attributes]
        else:
            string_var = next(var for var in items.domain.metas
                              if isinstance(var, StringVariable))
            return items.get_column(string_var)

    def save(self, filename):
        """
        Save the distance matrix to a file.

        Files with extension ``.xlsx`` are written by
        `_distmatrix_xlsx.write_matrix`; for others see
        :obj:`~Orange.misc.distmatrix.DistMatrix.from_file`.

        Args:
            filename: file name
        """
        if os.path.splitext(filename)[1] == ".xlsx":
            _distmatrix_xlsx.write_matrix(self, filename)
        else:
            self._save_dst(filename)

    def _save_dst(self, filename):
        """
        Save the distance matrix to a file in the file format described at
        :obj:`~Orange.misc.distmatrix.DistMatrix.from_file`.

        Args:
            filename: file name
        """
        n = len(self)
        data = f"{n}\taxis={self.axis}"
        row_labels = col_labels = None
        # Labels are obtained through get_labels so that all kinds of items
        # accepted by has_col_labels/has_row_labels (lists of strings,
        # tables with axis 0 ...) are written properly.
        if self.has_col_labels():
            data += "\tcol_labels"
            col_labels = self.get_labels(self.col_items)
        if self.has_row_labels():
            data += "\trow_labels"
            row_labels = self.get_labels(self.row_items)
        symmetric = self.is_symmetric()
        if not symmetric:
            data += "\tasymmetric"
        with open(filename, "wt", encoding="utf-8") as fle:
            fle.write(data + "\n")
            if col_labels is not None:
                fle.write("\t".join(str(e) for e in col_labels) + "\n")
            for i, row in enumerate(self):
                if row_labels is not None:
                    fle.write(str(row_labels[i]) + "\t")
                if symmetric:
                    # only the lower triangle, including the diagonal
                    fle.write("\t".join(map(str, row[:i + 1])) + "\n")
                else:
                    fle.write("\t".join(map(str, row)) + "\n")