Source code for pyttop.table.table

# -*- coding: utf-8 -*-
"""
Created on Sat Jul 30 2022

@author: Yu-Chen Wang

Main tools to store, operate and visualize data tables.
"""

import inspect
import io
import json
import multiprocessing as mp
import os
import pickle
import re
import warnings
import zipfile
from collections import OrderedDict, Counter
from collections.abc import Iterable
from copy import deepcopy
from difflib import get_close_matches
from functools import wraps
from itertools import repeat, chain
from keyword import iskeyword

import matplotlib.pyplot as plt
import numpy as np
from astropy.table import Column, Table, hstack
# from astropy.io import ascii as apascii

try:
    import pandas as pd # used to handle pd.DataFrame input
except ImportError:
    has_pd = False
else:
    has_pd = True

try:
    from tqdm import tqdm
except ImportError:
    has_tqdm = False
else:
    has_tqdm = True

from .. import __version__
from ..config import config
from ..plot import base as plot 
from ..utils import objdict, save_pickle, load_pickle, keyword_alias, bitwise_all, pause_and_warn, find_dup, SummaryDict, method_alias, omit_middle
from .exceptions import FailedToLoadError, SubsetError, SubsetInconsistentError, MergeError, SubsetMergeError, SubsetNotFoundError, GroupNotFoundError, ColumnNotFoundError

# Maps a count (1-6) to a two-element list.
# NOTE(review): presumably [n_rows, n_cols] of the subplot grid used for that
# many panels -- confirm at the use site.
subplot_arrange = {
    1: [1, 1],
    2: [1, 2],
    3: [1, 3],
    4: [2, 2],
    5: [2, 3],
    6: [2, 3],
    }

# Maps a plot-kind name to the corresponding function of the package's ``plot`` module.
plot_funcs = {
    'plot': plot.plot,
    'scatter': plot.scatter,
    'hist': plot.hist,
    'hist2d': plot.hist2d,
    'errorbar': plot.errorbar,
    }

# Alias of ``plot_funcs`` (the very same dict object, not a copy).
plot_array_funcs = plot_funcs

# plot_array_funcs = {
#     'plot': lambda ax: ax.plot,
#     'scatter': scatter_with_colorbar, # lambda ax: ax.scatter,
#     'hist': lambda ax: ax.hist,
#     'hist2d': lambda ax: ax.hist2d,
#     'errorbar': lambda ax: ax.errorbar,
#     }


[docs]
class Subset():
    '''
    A class to specify a row subset of a ``pyttop.table.Data`` object.
    Although this class is independent to the ``Data`` class, it should be only used together with a ``Data`` object.

    The common way to specify the selection criteria, name, etc. of a subset is::

        Subset(<selection>, name=<name>, <...>)

    Convenient methods for specifying a subset are::

        Subset.by_range(<column name>=<value range>, <...>)
        Subset.by_value(<column name>, <value>)

    See :meth:`~Subset.by_range()` and :meth:`~Subset.by_value()` for more information.

    In practice, a subset of ``data`` is usually defined using the :meth:`~Data.add_subsets()` method::

        subset = data.add_subsets(Subset(<...>))

    You may also define multiple subsets at a time::

        subset1, subset2 = data.add_subsets(
            Subset(<...>),
            Subset(<...>))

    ``Subset`` objects can be used as if they are arrays (for most cases).
    For example, you can get the intersection set ``subset1 & subset2``,
    the union set ``subset1 | subset2``, and the complementary set ``~subset1``.

    Note that the name (will be auto-generated if not given) is used as the address of a subset in a ``data``.
    If you add a subset to a certain subset group in which the name is already used by another subset,
    the original subset will be replaced and no longer recognized as part of that ``data``::

        s1 = data.add_subsets(Subset(<...>, name='subset'))
        s2 = data.add_subsets(Subset(<...>, name='subset')) # this replaces the original subset at 'default/subset'
        s1 in data, s2 in data # (False, True)

    Parameters
    ----------
    selection : callable (e.g. function), iterable (e.g. array-like) or string
        If it is iterable, it should be a boolean array indicating whether each row is included in this subset.
        It should have a shape of ``(len(data),)`` where ``data`` is an ``pyttop.table.Data`` instance.

        If it is callable, it should be defined like below::

            def selection(table): # input: astropy.table.Table object
                <...>
                return arr # boolean array
                           # whether each row is included in subset

        If it is a string, should be an expression that can be evaluated by ``Data.eval``, e.g.
        ``'(column1 > 0) & (column2 < 1)'``. Refer to :meth:`~Data.eval` for details.
    name : str, optional
        The name of the subset. The default is None.
    expression : str, optional
        The expression [e.g. '(col1 > 0) & (col2 == "A")'] used to recognize the conditions.
        The default is None.
    label : str, optional
        The label used in figures.
        The default is None.
    kwargs :
        Arguments passed to ``Data.eval()`` if ``selection`` is evaluated as an expression.

    Notes
    -----
    The parameters ``selection``, ``name``, ``expression``, and ``label`` will become attributes of the ``Subset`` object.
    By defining a subset using ``data.add_subsets(Subset(<...>))``, they are evaluated given ``data``:

    - The attribute ``selection`` will be converted to a boolean array;
    - The attribute ``name`` will be set to the default name if it is None;
    - The attribute ``expression`` will be automatically set if it is None;
    - The attribute ``label`` will be set to ``name`` if it is None; strings will be replaced
      according to the mapping of dict ``data.col_labels``.

    If the input ``selection`` is/results in a masked (boolean) array, the masked elements
    are filled with False (which means that they are NOT included in this subset by definition).
    This often happens when ``selection`` is calculated from a masked column of the table.
    The final ``selection`` after evaluation is never a masked array.

    **Caveat**. Subsets constructed with ``expr`` and ``~expr`` are NOT necessarily complements of each other!
    See the example below::

        >>> from pyttop.table import Data, Subset
        >>> d = Data(name='test')
        >>> d['x'] = [-1, 1, -99]
        >>> d.mask_missing(missval=-99)
        [mask missing] col 'x': 1/3 (33.33%) masked (value: -99).
        >>> s1 = d.add_subsets(Subset('x < 0'))
        >>> s2 = d.add_subsets(Subset('~(x < 0)'))
        >>> s21 = d.add_subsets(Subset('x >= 0'))
        >>> s1, s1.selection
        (<Subset 'x < 0' of Data 'test' (1/3)>, array([ True, False, False]))
        >>> s2, s2.selection
        (<Subset '~(x < 0)' of Data 'test' (1/3)>, array([False,  True, False]))
        >>> s21, s21.selection
        (<Subset 'x >= 0' of Data 'test' (1/3)>, array([False,  True, False]))
        >>> (~s1), (~s1).selection
        (<Subset 'NOT(x < 0)' of Data 'test' (2/3)>, array([False,  True,  True]))
        >>> (~s2), (~s2).selection
        (<Subset 'NOT(~(x < 0))' of Data 'test' (2/3)>, array([ True, False,  True]))
        >>> (~s21), (~s21).selection
        (<Subset 'NOT(x >= 0)' of Data 'test' (2/3)>, array([ True, False,  True]))

    '''
    # Notes for developers
    # --------------------
    # Currently, the '&', '|', '~' operations can only be performed if selection is a boolean array,
    # or ``Subset.eval_`` is called. (This is always called when a subset is defined in ``data.add_subsets()``.)
    # Operation before evaluation may be supported in the future.
    def __init__(self, selection, name=None, expression=None, label=None, **kwargs):
        '''
        Specify a subset of ``Data``.

        '''
        if name is not None and '/' in name:
            raise ValueError('"/" not supported in Subset names')

        self.selection = selection
        self.name = name
        self.expression = expression
        self.label = label
        # self.data_name = None
        self._data = None
        self.kwargs = kwargs

    @property
    def data(self):
        '''
        Read-only view of ``self._data``: None right after ``__init__()``,
        otherwise the object stored there (e.g. the ``data`` passed to ``eval_()``).
        '''
        return self._data

    @property
    def data_name(self):
        if self.data is None:
            return None
        else:
            return self.data.name


[docs]
    @classmethod
    def by_range(cls, **ranges):
        '''
        Initializes a subset by specifying ranges for the data.

        For example, ``Subset.by_range(col1=[0, 1], col2=[0, np.inf])``
        defines a subset with `(0 < col1 < 1) & (col2 > 0)`.

        Parameters
        ----------
        **ranges : key - value pairs:
            key : str
                Name of the column in the data.
            value : list or tuple (or other similar objects) with length=2
                List of 2 numbers, e.g. ``[0, 1]``, specifying a range of the column.

        Returns
        -------
        ``pyttop.table.Subset``

        '''
        # get Subset from range
        def selection(t):
            selected = True
            for col, range_ in ranges.items():
                selected &= (t[col] > range_[0]) & (t[col] < range_[1])
            return selected # the boolean array
        name = '&'.join([f'{col}({range_[0]}-{range_[1]})' for col, range_ in ranges.items()])
        expression = ' & '.join([f'({col} > {range_[0]}) & ({col} < {range_[1]})' for col, range_ in ranges.items()])
        label = ', '.join([f'{col}$\\in$({range_[0]}, {range_[1]})' for col, range_ in ranges.items()])

        name = Subset._remove_slash(name)
        return cls(selection, name=name, expression=expression, label=label)



[docs]
    @classmethod
    def by_value(cls, column, value):
        '''
        Initializes a subset by specifying the exact value of column.

        Parameters
        ----------
        column : str
            Name of the data column.
        value :
            Value of the column.

        Returns
        -------
        ``pyttop.table.Subset``

        '''
        def selection(t):
            return t[column] == value # the boolean array
        name = f'{column}={value}'
        expression = name
        label = value if type(value) in [str, np.str_] else f'{column}$=$' + '$\\mathrm{' + f'{value}' + '}$'

        name = Subset._remove_slash(name)
        return cls(selection, name=name, expression=expression, label=label)


    @staticmethod
    def _remove_slash(name, rep='_'):
        if '/' in name:
            oldname = name
            name = oldname.replace('/', rep)
            warnings.warn(f"'/' is not supported as a subset name. '{oldname}' renamed to '{name}'.")
        return name


[docs]
    def eval_(self, data, existing_keys=()):
        '''
        Evaluate the selection array, expression, name and label, given data.
        This method should be executed if, self.selection is not a boolean array
        OR either self.name or self.expression is None.

        Note that if a subset is added to ``data`` using ``data.add_subset(Subset(<...>))``,
        this method is already called, and do not need to be called again.

        Parameters
        ----------
        data : ``pyttop.table.Data``

        existing_keys : Iterable, optional
            Names of subsets that already exists.
            This is used to automatically generate subset names.
            The default is ().
        '''
        # detects dangerous operations of re_evaluating a subset with a different data while it is still recorded in a data
        if (self.data is not None and self.data is not data # re-evaluating?
            and self in self.data # but self is still in self.data?
            ):
            raise SubsetInconsistentError(f"{self} already added to {self.data}")

        # self.data_name = data.name
        self._data = data

        # get selection array and expression
        if callable(self.selection):
            if self.expression is None:
                self.expression = inspect.getsource(self.selection)
            self.selection = self.selection(data.t)
            # if self.selection.dtype != bool:
            #     raise TypeError(f"selection function must return ")
        elif type(self.selection) is str:
            if self.expression is None:
                self.expression = self.selection
            if self.name is None:
                # if '/' in self.expression:
                #     msg = f"failed to set subset name"
                #     warnings.warn(msg)
                self.name = self.expression

            # if self.selection in ['all', 'All']:
            #     self.selection = np.full(len(data), True)
            # else:
            self.selection = data.eval(self.selection, **self.kwargs)

                ### old implementation below:
                # # check string: avoid error if the string contains something like "self", "data" but are not real column names
                # names = list(chain(locals(), globals()))
                # for name in names:
                #     if name in ['np', 'os']:
                #         continue
                #     if name in self.selection and name not in data.colnames:
                #         raise KeyError(name)

                # for colname in data.colnames: # replace colnames to expression
                #     self.selection = self.selection.replace(colname, f"data.t['{colname}']")

                # try:
                #     print('[subset] evaluating', self.selection)
                #     self.selection = eval(self.selection)
                # # except NameError as e:
                # #     raise KeyError(e.name)
                # except Exception as e:
                #     raise FailedToEvaluateError(f"Auto-generated expression cannot be evaluated: ({self.selection}). Check your input (see above for error) or try other methods to specify a subset.") from e
                # except:
                #     raise
                ### end

        elif isinstance(self.selection, Iterable):
            if self.expression is None:
                self.expression = '<array>'

        else:
            raise TypeError(f"selection of Subset should be function, array-like object or string, got '{type(self.selection)}'.")

        # make selection a boolean array (if not)
        if not (isinstance(self.selection, Iterable) and not isinstance(self.selection, str)):
            raise TypeError(f"selection string or function should return a boolean array, got '{type(self.selection)}'.")
        # if len(self.selection) != len(data):
        #     raise ValueError(f'length of array should be {len(data)}, got {len(self.selection)}.')
        if not isinstance(self.selection, np.ndarray):
            self.selection = np.array(self.selection)
            # else: if it is masked, keep it masked (otherwise np.array(self.selection) will get its data only)
        if self.selection.shape != (len(data),):
            msg = f"bad selection string/function/array: excepted shape is {(len(data),)}, got {self.selection.shape}"
            raise ValueError(msg)
        if self.selection.dtype != bool:
            warnings.warn('selection array is not boolean: converted to boolean array',
                          stacklevel=3)
            self.selection = self.selection.astype(bool)
        self.selection = self.selection.copy() # TODO: this was initially used for array-like object as input of __init__(). can it be improved?

        # directly fill masked to False (this makes a difference when using e.g. __or__, __invert__)
        # if np.ma.is_masked(self.selection): # this returns False unless the input is a MaskedArray *containing masked values*
        if isinstance(self.selection, np.ma.MaskedArray): # this makes sure the selection of Subset is never masked array
            self.selection = self.selection.filled(False) # IMPORTANT: Masked elements do NOT belong to this subset!

        # get name
        if self.name is None:
            i = 0
            while f'subset{i}' in existing_keys:
                i += 1
            self.name = f'subset{i}'

        # get label
        if self.label is None:
            self.label = self.name

        # replace colname with label
        # TODO: this is not robust. This should be used to modify the labels for subset
        # initialized by Subset.by_range and Subset.by_value, because they do not know the labels during
        # initialization. If a subset is initialized by Subset() and label is given by the user,
        # it should not be modified.
        for colname, labelstr in data.col_labels.items():
            self.label = self.label.replace(colname, labelstr)

        # remove '/' in name
        # '/' may be present in name when setting `self.name = self.expression` and `'/' in self.expression`.
        if '/' in self.name:
            self.name = self.name.replace('/', '(slash)')

        # check label
        if self.label in ['All', 'all'] and not np.all(self.selection):
            warnings.warn(
                f"{self} is not the entire set but the label is '{self.label}'",
                stacklevel=3)



    def _cut(self, index, new_data=None):
        # return a cut Subset (cut with ``index``)
        cut_subset = Subset(
            self.selection[index], # Note: this is not a copy of self.selection. This is not a problem, since the array it refers to is never modified in the code.
            self.name,
            self.expression,
            self.label,
            )
        # cut_subset.data_name = data_name
        if new_data is not None:
            cut_subset._data = new_data
        return cut_subset

    @property
    def size(self): # the size of the subset
        return np.sum(np.array(self))


[docs]
    def eqs(self, subset):
        '''
        Checks if selections of two subsets are the same. For example::

            if subset1.eqs(subset2):
                print('same')

        Parameters
        ----------
        subset : `pyttop.table.Subset`

        Returns
        -------
        bool
        '''
        if self.data is not subset.data:
            raise ValueError('comparing subsets of different data')
        return np.all(np.array(self) == np.array(subset))


    @staticmethod
    def _merge_data_info(method):
        # this is a decorator
        # merge data info of the two subsets for binary operators
        @wraps(method)
        def new_method(self, subset):
            # data_recorded = self.data is not None and subset.data is not None
            # if ((data_recorded and self.data is not subset.data)
            #     or self.data_name != subset.data_name):
            if self.data is not subset.data:
                warnings.warn(f"trying to broadcast together subsets of two different Data: \n{self} and {subset}",
                              stacklevel=2 + 1) # add 1 when using @_check_operand
            new_subset = method(self, subset)
            # if self.data_name == subset.data_name:
            #     new_subset.data_name = self.data_name
            # if data_recorded and self.data is subset.data:
            if self.data is subset.data:
                new_subset._data = self.data
            return new_subset
        return new_method

    @staticmethod
    def _check_operand(method):
        # this is a decorator
        # check if operand is Subset
        @wraps(method)
        def new_method(self, subset):
            if not isinstance(subset, self.__class__):
                return NotImplemented
            return method(self, subset)
        return new_method

    def _isall(self):
        # used in __and__ when combining labels
        # 0.4.3 update: consider better ways to decide whether it is "all" (in case, e.g., a subset labelled "All" is not all)
        # return self.label == 'All' # old definition
        # return self.label == 'All' and np.all(self.selection) # stricter condition
        return self is self.data.get_subsets('default/all') # strictest condition

    @_check_operand
    @_merge_data_info
    def __and__(self, subset): # the & (bitwise AND)
        selection = self.selection & subset.selection
        name = f'{self.name} AND {subset.name}'
        expression = f'({self.expression}) AND ({subset.expression})'
        if (not self._isall()) and (not subset._isall()):
            label = f'{self.label}, {subset.label}'
        elif self._isall():
            label = f'{subset.label}'
        else: # subset._isall()
            label = f'{self.label}'

        new_subset = Subset(selection, name, expression, label)
        return new_subset

    @_check_operand
    @_merge_data_info
    def __or__(self, subset): # the | (bitwise OR)
        selection = self.selection | subset.selection
        name = f'{self.name} OR {subset.name}'
        expression = f'({self.expression}) OR ({subset.expression})'
        label = f'[{self.label}] or [{subset.label}]'

        new_subset = Subset(selection, name, expression, label)
        return new_subset

    def __invert__(self): # the ~ (bitwise NOT)
        selection = ~self.selection
        name = f'NOT({self.name})'
        expression = f'NOT({self.expression})'
        label = f'not [{self.label}]'

        new_subset = Subset(selection, name, expression, label)
        # new_subset.data_name = self.data_name
        new_subset._data = self.data
        return new_subset

    def __array__(self):
        # if not hasattr(self.selection, 'dtype') or self.selection.dtype != bool: #not isinstance(self.selection, Iterable):
            # raise TypeError('Selection should be a boolean array. Maybe forgot to run eval_()?')
        if not isinstance(self.selection, np.ndarray) or self.selection.dtype != bool:
            raise TypeError('selection should be a boolean array')
        if np.ma.is_masked(self.selection): # this never happens after I directly fill masked to False. This is kept to handle instances of old versions.
            return self.selection.filled(False) # IMPORTANT: Masked elements do NOT belong to this subset!
        else:
            return self.selection

    def __len__(self):
        # Length of the whole selection array, NOT the number of selected rows
        # (the latter is given by the ``size`` property).
        # this is actually (and should be) the same as len(data).
        return len(np.array(self))

    def __repr__(self):
        # return f"Subset('{self.selection}')"
        # return f"Subset(name='{self.name}', selection={self.selection.__repr__()})"
        namestr = f"Subset '{self.name}'" if self.name is not None else 'Unnamed Subset'
        if self.data is not None:
            datastr = " of Data "
            short_data_name = self.data._short_name
            datastr += f"'{short_data_name}'" if short_data_name is not None else 'without name'
        else:
            datastr = ''
        try:
            fracstr = f' ({self.size}/{len(self)})'
        except TypeError: # from __array__
            fracstr = ''
        return f"<{namestr}{datastr}{fracstr}>"

    def __getstate__(self):
        state = self.__dict__.copy()
        del state['_data'] # data should not be pickled
        return state

    def __setstate__(self, state):
        # Restore a Subset from ``state`` (a dict of instance attributes) when unpickling.
        #
        # Call __init__() to initialize some attributes in case not provided by state.
        # This can be useful when restoring Subset from a pkl file of an older version (that may lack ``self.data_name``).
        # Also, ``self.data`` is not pickled and will be initialized as None.
        self.__init__(None)
        # Restore instance attributes (these overwrite the defaults set just above)
        self.__dict__.update(state)




[docs]
@method_alias
class Data(plot.PlotMethodsMixin):
    '''
    A class to store, manipulate and visualize data tables.

    Parameters
    ----------
    data : str, file-like, astropy.table.Table, pandas.DataFrame, or similar
        The data table, which can be one of the following:
            
        - A string path to a data file
        - A file-like object (e.g., returned by ``open()``)
        - An ``astropy.table.Table`` object
        - A ``pandas.DataFrame``, or any object that can be initialized as an ``astropy.table.Table``
    name : str, optional
        The name of this Data object. This name will be used in many cases to distinguish datasets. The default is None.
    **kwargs :
        Additional keyword arguments passed when initializing an ``astropy.table.Table`` object.
        
        Common arguments include:

        format : str, optional
            File format specifier for ``astropy.table.Table.read()`` 
            (relevant when reading from a file path or file-like object). 
            For a list of supported formats see the 
            `Astropy documentation <https://docs.astropy.org/en/stable/io/unified_table.html#built-in-table-readers-writers>`_.

    Notes
    -----
    - The data table of a ``Data`` instance (i.e. ``data.t``) is not expected to be changed since creation.
      If ``data.t`` is changed, the matching and subset information may be inconsistent with the table.
      Create a new ``Data`` instance instead.

    Attributes
    ----------
    t : ``astropy.table.Table``
        The table.
    colnames : list
        A list of column names.
    shape : tuple
        ``(<number_of_rows>, <number_of_columns>)``
    '''
    def __init__(self, data=None, name=None, **kwargs):
        '''
        Load or wrap the table, then initialize the matching, subset and plot records.

        Parameters
        ----------
        data : str, path-like, file-like, astropy.table.Table, pandas.DataFrame, or similar
            A path (``str`` or ``os.PathLike``) or a file-like object is read with
            ``astropy.table.Table.read()``; a ``Table`` is used as is (not copied);
            a ``pandas.DataFrame`` is converted with ``Table.from_pandas()``;
            anything else is passed to ``astropy.table.Table()``.
        name : str, optional
            Name of this Data. If None, a warning is issued and the name is derived
            from the path (or from the description of the input).
        **kwargs :
            Keyword arguments passed to ``astropy.table.Table.read()``, ``Table.from_pandas()``
            or ``astropy.table.Table()``, depending on the type of ``data``.

        Raises
        ------
        TypeError
            If ``data`` is already a ``Data`` object.
        ValueError
            If ``data`` is a path and ``format`` is ``'data'`` or ``'pkl'`` (use ``Data.load`` instead).
        '''
        if isinstance(data, self.__class__):
            raise TypeError('input is already a Data object')

        # Path-like objects (e.g. pathlib.Path) are handled exactly like str paths.
        if isinstance(data, os.PathLike):
            data = os.fspath(data)
        is_path = isinstance(data, str)

        if is_path and 'format' in kwargs and kwargs['format'] in ['data', 'pkl']: # should use Data.load
            raise ValueError(f"to load data file saved with Data.save, use Data.load('{data}', format='{kwargs['format']}')")

        if name is None:
            warnings.warn('It is recommended to input a name.',
                          stacklevel=2)

        # TODO: save a pkl (or other formats) file while reading an ascii file,
        # so that the next time this ascii file is read (if not modified), use the pkl
        # file to accelerate data loading process.

        # get data
        if is_path: # got a path
            self.t = Table.read(data, **kwargs)
            self._path = data
        elif isinstance(data, io.IOBase): # got a file-like object
            self.t = Table.read(data, **kwargs) # see astropy.io.registry.core.UnifiedInputRegistry.read
            try:
                self._path = data.name
            except AttributeError: # the object does not have a name (e.g., BytesIO)
                self._path = f'(initalized from a {type(data)} object)'
        elif isinstance(data, Table): # got astropy table
            self.t = data
            self._path = '(initialized from Table)'
        # for large ascii files, loading with pd and converting it to astropy.table.Table seems to be faster
        elif has_pd and isinstance(data, pd.DataFrame):
            self.t = Table.from_pandas(data, **kwargs)
            self._path = '(initialized from DataFrame)'
        else: # try to convert to data
            self.t = Table(data, **kwargs)
            self._path = f'(initalized from a {type(data)} object)'
        self.meta['path'] = self._path

        # basic properties
        self.name = name
        if self.name is None and self._path is not None:
            # use the last component of the path (both '/' and '\\' are treated as separators)
            self.name = self._path.split('/')[-1].split('\\')[-1]

        # set metadata for columns (TODO: experimental feature)
        # Columns that already carry any of these keys are left untouched.
        meta_keys = ['src', 'src_detail', 'set_by_user']
        for colname in self.colnames:
            col = self.t[colname]
            if all(key not in col.meta.keys() for key in meta_keys):
                col.meta['src'] = self.name
                col.meta['src_detail'] = f'{self._path}'
                col.meta['set_by_user'] = False # whether the value of this column is modified by the user.
                    # TODO: CAUTION: modification can only be detected when using data[] instead of data.t[]

        # matching
        self.matchinfo = []
        self.matchnames = []
        self.matchlog = []

        # subset
        self.subset_groups = {
            'default': {'all': self._gen_subset_all()}
            }

        # plot
        self.col_labels = {} # {column_name: label_in_figures}
        self.plot_axes = None # the axes for the last plot
        self.plot_fig = None # the fig for the last plot
        self.plot_returns = [] # the returns of the last plot

    #### properties

    @property
    def colnames(self):
        # The ``colnames`` attribute of the underlying table ``self.t``.
        return self.t.colnames

    @property
    def meta(self):
        # The ``meta`` attribute of the underlying table ``self.t`` (the very same object, not a copy).
        # notes: the table file (loaded from other formats) itself can contain its own metadata, so do not clear it.
        return self.t.meta

    @property
    def path(self):
        # Read-only access to ``self._path``.
        return self._path

    @property
    def shape(self):
        # Tuple ``(len(self), number of column names)``.
        # return n_row, n_column
        # this is compatible with spyder_kernels.utils.nsview.get_size (priority: shape -> size -> __len__)
        return len(self), len(self.colnames)

    #### matching & merging


[docs]
    def match(self, data1, matcher, verbose=True, replace=False):
        '''
        Match another `pyttop.table.Data` object `data1` to this data object.

        The result of the matching is appended to ``self.matchinfo``, the name of ``data1``
        to ``self.matchnames``, and a one-line summary to ``self.matchlog``.

        Parameters
        ----------
        data1 : `pyttop.table.Data`
            Data to be matched to this Data.
        matcher : any recognized matcher object
            A matcher object used to match the two data objects.
            Built-in matchers includes, e.g., :class:`~pyttop.matcher.ExactMatcher` and :class:`~pyttop.matcher.SkyMatcher`.
            If a class is given instead of an instance, it is instantiated without arguments.

            A matcher object should be defined like below::

                class MyMatcher():
                    def __init__(self, args): # 'args' means any number of arguments that you need
                        # initialize it with args you need
                        pass

                    def get_values(self, data, data1, verbose=True): # data1 is matched to data
                        # prepare the data that is needed to do the matching (if necessary)
                        pass

                    def match(self):
                        # do the matching process and calculate:
                        # idx : array of shape (len(data), ).
                        #     the index of a record in data1 that best matches the records in data
                        # matched : boolean array of shape (len(data), ).
                        #     whether the records in data can be matched to those in data1.
                        return idx, matched

        verbose : bool, optional
            Whether to output matching information. The default is True.
        replace : bool, optional
            When ``data1`` (Data to be matched) has the same name as a Data object that has already been matched to this Data,
            whether to replace the old matching.
            If False, a ValueError is raised.
            The default is False.

        Raises
        ------
        TypeError
            - ``data1`` is not a ``Data`` object.
            - ``matcher`` is a class that cannot be instantiated without arguments.
        ValueError
            - Data with the same name is matched to this Data twice (and ``replace`` is False).

        Returns
        -------
        self

        '''
        is_data = isinstance(data1, Data) or type(data1) == type(self)
        if not is_data:
            raise TypeError(f"only supports matching 'pyttop.table.Data' type; got {type(data1)}")

        if inspect.isclass(matcher):
            # a matcher class was passed: try to instantiate it with no argument
            try:
                matcher = matcher()
            except TypeError as e:
                raise TypeError(f"{matcher.__name__} is not instantiated") from e

        # names are currently used as IDs in the context of matching and merging, so any name conflict is not allowed.
        already_matched = data1.name in self.matchnames
        if already_matched and not replace:
            raise ValueError(f"Data with name '{data1.name}' has already been matched. This may result from name duplicates or re-matching the same catalog. "
                             f"Set 'replace=True' to replace the existing match with '{data1.name}'.")
        if already_matched:
            self.unmatch(data1)

        matcher.get_values(self, data1, verbose=verbose)
        idx, matched = matcher.match()

        self.matchinfo.append(objdict(matcher=matcher, data1=data1, idx=idx, matched=matched))
        self.matchnames.append(data1.name)

        n_matched, n_total = np.sum(matched), len(matched)
        matchstr = f'"{data1._short_name}" matched to "{self._short_name}": {n_matched}/{n_total} matched.'
        self.matchlog.append(matchstr)
        if verbose:
            print('[match] ' + matchstr)

        return self



[docs]
    def unmatch(self, data1, verbose=True):
        '''
        Remove the match of ``data1``.

        Parameters
        ----------
        data1 : ``pyttop.table.Data`` or str
            The Data or the name of the Data.
        verbose : bool, optional
            Whether to output information. The default is True.

        Raises
        ------
        TypeError
            If ``data1`` is neither a ``pyttop.table.Data`` nor a str.

        Returns
        -------
        None.

        Notes
        -----
        If no data with the given name has been matched, a warning is issued
        and nothing is changed.

        '''
        # Resolve the name used as the matching ID.
        if isinstance(data1, str):
            name1 = data1
        elif isinstance(data1, Data) or type(data1) == type(self):
            name1 = data1.name
        else:
            raise TypeError(f"only supports 'pyttop.table.Data' or str; got {type(data1)}")

        if name1 not in self.matchnames:
            # Use the resolved name here: ``data1`` may be a plain str, which has no ``.name``.
            warnings.warn(f"Data with name '{name1}' has never been matched. Nothing is done.",
                          stacklevel=2)
            return

        # Drop every match record whose matched data carries this name.
        self.matchinfo = [info for info in self.matchinfo if info.data1.name != name1]
        self.matchnames.remove(name1)

        unmatchstr = f'"{name1}" unmatched to "{self.name}".'
        if verbose: print('[match] ' + unmatchstr)
        self.matchlog.append(unmatchstr)



[docs]
    def reset_match(self):
        '''
        Forget every match made to this data.

        The match records (``matchinfo``), the match log (``matchlog``) and the
        list of matched names (``matchnames``) are each replaced by a new empty list.

        '''
        self.matchinfo, self.matchlog, self.matchnames = [], [], []


    def _match_propagate(self, idx=None, matched=None, depth=-1, ignore_id=None, tree=None):
        '''
        Propagate all of the match to self's "child" data to self's "parent" data.

        Parameters
        ----------
        idx : Iterable, optional
            'idx' information for self matched to parent data. The default is None.
        matched : Iterable, optional
            'matched' information for self matched to parent data. The default is None.
        depth : int, optional
            Depth. The default is -1.
        ignore_id : Iterable, optional
            Data id to be ignored. The default is [].
        tree : dict, optional
            The returned ``tree`` of self._match_tree, used to decide whether a data should be merged.

        Returns
        -------
        idxs, matcheds : list
            The 'idx', 'matched' information for (all of self's child data) matched to (self's parent data).

        '''
        if ignore_id is None: ignore_id = []
        data1s, idxs, matcheds, depths, has_child = [], [], [], [], []

        if tree and not tree[self]['merge']: # myself should not be merged?! do nothing
            return data1s, idxs, matcheds, ignore_id
        childs = tree[self]['child'] if tree else None

        ## below: myself should be merged
        if id(self) not in ignore_id:
            ignore_id.append(id(self))
        elif tree:
            raise RuntimeError('this should not happen')

        if idx is None:
            idx = np.arange(len(self))
        if matched is None:
            matched = np.full((len(self),), True)

        if depth != 0:
            for info in self.matchinfo: # analyze the child data of self
                data1 = info.data1
                if childs: assert childs[data1]['matcher'] is info.matcher
                if id(data1) in ignore_id or (childs and not childs[data1]['merge']):
                    continue

                # get match info for data1  (self's parent to self's child "data1")
                idx_s = info.idx # _s: self - child match
                matched_s = info.matched

                idx_temp = idx.copy()
                # l_p = len(idx) # length of parent data
                idx_temp[~matched] = 0
                idx_ps = idx_s[idx_temp] # _ps: parent - child match
                matched_ps = matched_s[idx_temp]
                matched_ps &= matched
                idx_ps[~matched_ps] = -len(data1) - 1

                data1s.append(data1)
                # ignore_id.append(id(data1)) # this is done in data1._match_propagate
                idxs.append(idx_ps)
                matcheds.append(matched_ps)
                depths.append(depth-1)

                # ask data1 to give me all of its child data
                data1_data1s, data1_idxs, data1_matcheds, data1_depths, data1_has_child, ignore_id = data1._match_propagate(idx=idx_ps, matched=matched_ps, depth=depth-1, ignore_id=ignore_id, tree=childs)
                # if len(data1_data1s) == 0: # data1 has no children
                #     has_child.append(False)
                # else:
                #     has_child.append(True)
                has_child.append([d.name for d in data1_data1s])
                data1s += data1_data1s
                idxs += data1_idxs
                matcheds += data1_matcheds
                depths += data1_depths
                has_child += data1_has_child

        return data1s, idxs, matcheds, depths, has_child, ignore_id


[docs]
    def merge_matchinfo(self, depth=-1):
        '''
        Collect the match information of all children data of this data,
        expressed with respect to **this** data.
        If a child data is found more than once, only the first found is used.

        Parameters
        ----------
        depth : int, optional
            The depth of merging.
            For example, if depth == 1, only the direct children (without grandchildren) of
            this data are merged.
            if depth == -1, all children (including all grandchildren) are merged.
            The default is -1.

        Returns
        -------
        outinfo : list of objdicts
            One entry per propagated data, with the keys ``matcher`` (always None),
            ``data1``, ``idx``, ``matched``, ``depth`` and ``has_child``.

        '''
        match_tree = self._match_tree(depth=depth)[0]
        data1s, idxs, matcheds, depths, has_child, _ = self._match_propagate(depth=depth, tree=match_tree)
        return [
            objdict(
                matcher=None,
                data1=child,
                idx=child_idx,
                matched=child_matched,
                depth=child_depth,
                has_child=child_has_child,
                )
            for child, child_idx, child_matched, child_depth, child_has_child
            in zip(data1s, idxs, matcheds, depths, has_child)
            ]


    @staticmethod
    def _cut_subset_groups(subset_groups, index, new_data=None):
        # cut each subset with [index]
        subset_groups = deepcopy(subset_groups)
        for group in subset_groups:
            for subset in subset_groups[group]:
                subset_groups[group][subset] = subset_groups[group][subset]._cut(index, new_data)
        return subset_groups

    @staticmethod
    def _mask_subset_groups(subset_groups, index, in_place=True, val=False):
        # change all subsets' [index] values to val (False).
        if not in_place:
            subset_groups = deepcopy(subset_groups)
        for group in subset_groups:
            for subset in subset_groups[group]:
                subset_groups[group][subset].selection[index] = val
        return subset_groups


    @staticmethod
    def _merge_subset_groups(data_subset_groups, data_names, verbose=True):
        # Combine the subset groups of several data into one dict of groups.
        # data_subset_groups[i] holds the groups of the data named data_names[i].
        # Name conflicts are resolved by renaming; SubsetMergeError is raised if a
        # conflict remains after renaming.

        # A new 'all' subset, with the length of the first data's 'all' subset.
        nrows = len(data_subset_groups[0]['default']['all'])
        everything = Subset(np.ones(nrows).astype(bool), name='all', expression='all', label='All')
        merged = {'default': {'all': everything}}

        ## merge 'default' group ##
        for i, data_name in enumerate(data_names):
            own_default = data_subset_groups[i]['default']
            # subset names in the 'default' group of every other data
            foreign_names = [list(groups['default'])
                             for j, groups in enumerate(data_subset_groups) if j != i]
            for name, subset in own_default.items():
                if name == 'all':
                    continue
                outname = name
                if any(name in names for names in foreign_names):
                    outname = '_'.join([name, data_name])
                    if verbose:
                        print(f"[merge] subset renamed: {name} -> {outname}")
                if outname in merged['default']:
                    raise SubsetMergeError(f"Subset merging results in name duplicates: '{outname}'. "
                                           f"You may change the name of your data to avoid duplicate Data names ('{data_name}').")
                merged['default'][outname] = subset

        ## merge other groups ##
        for i, data_name in enumerate(data_names):
            own_groups = data_subset_groups[i]
            # group names of every other data
            foreign_names = [list(groups)
                             for j, groups in enumerate(data_subset_groups) if j != i]
            for name, group in own_groups.items():
                if name == 'default':
                    continue
                outname = name
                if any(name in names for names in foreign_names):
                    outname = '/'.join([data_name, name])
                    if verbose:
                        print(f"[merge] group renamed: {name} -> {outname}")
                if outname in merged:
                    raise SubsetMergeError(f"Subset merging results in name duplicates: '{outname}'. "
                                           f"You may change the name of your data to avoid duplicate Data names ('{data_name}').")
                merged[outname] = group

        return merged

    @staticmethod
    def _decide_missing_value(col):
        # decide the value representin missing values
        # refer to https://numpy.org/doc/stable/reference/generated/numpy.dtype.kind.html#numpy.dtype.kind
        kind = col.dtype.kind
        if np.ma.is_masked(col):
            col = col[~col.mask]

        miss = None
        # TODO: below
        if kind in 'b': # boolean
            pass
        elif kind in 'i': # signed int
            miss = -99
            while np.min(col) < miss/9:
                miss = (miss - 1)*10 + 1
        elif kind in 'u': # unsigned int
            miss = 99
            while np.max(col) > miss/9:
                miss = (miss + 1) * 10 - 1
        elif kind in 'f': # float
            miss = np.nan
        elif kind in 'c': # complex float
            pass
        elif kind in 'm': # timedelta
            pass
        elif kind in 'M': # datetime
            pass
        elif kind in 'O': # object
            miss = 'N/A'
        elif kind in 'S': # (byte-)string
            n = col.dtype.itemsize
            if n == 1:
                return '?'
            elif n == 2:
                return 'NA'
            else:
                return 'N/A'
        elif kind in 'U': # Unicode
            n = col.dtype.itemsize / 4
            if n == 1:
                return '?'
            elif n == 2:
                return 'NA'
            else:
                return 'N/A'
        elif kind in 'V': # void
            pass
        return miss


[docs]
    def merge(self, depth=-1, keep_unmatched=[],
              merge_columns={}, ignore_columns={},
              innames={}, outname=None,
              keep_subsets=False, matchinfo_subset=False,
              verbose=True):
        '''
        Merge all data objects that are matched to this data.

        The data that are directly matched to this data are called "children" of this data, and are on "depth 1".
        The data directly matched to data on "depth 1" are on "depth 2", etc.

        Parameters
        ----------
        depth : int, optional
            The depth of merging.
            For example, if ``depth == 1``, only the direct children (without grandchildren) of
            this data are merged.
            if ``depth == -1``, all children (including all grandchildren) are merged.
            The default is -1.
        keep_unmatched : Iterable, str or True, optional
            A list of names of `pyttop.table.Data` objects (you can check the names with e.g. `data.name`).
            A single name may also be given as a str.
            A record (row) of THIS data is kept even if a dataset in the above list cannot be matched to this data.
            To set ``keep_unmatched`` for all data, pass ``keep_unmatched=True``.
            The default is [] (which means that only those that can be matched to each child data of this data are kept).
        merge_columns : dict, optional
            A dict that specifies fields (columns) to be merged.
            For example, if ``data1`` with name 'Data_1' is matched to this object, and you want to merge only
            'column1', 'column2' in ``data1`` into the merged catalog, use::

                {'Data_1': ['column1', 'column2']}

            If, e.g, ``merge_columns`` for ``data2`` (with name 'Data_2') is not specified, all fields (columns) of ``data2`` will be merged.
            The list can also include regular expressions, e.g.::

                # requires `import re`
                {'Data_1': ['column1', re.compile('class.*')]}

            A compiled pattern selects every column whose name it matches with ``search``.
            The default is {}.
        ignore_columns : dict, optional
            A dict that specifies fields (columns) not to be merged.
            Similar to argument ``merge_columns``.
            If both ``merge_columns`` and ``ignore_columns`` are specified for a field,
            the columns IN ``merge_columns`` AND NOT IN ``ignore_columns`` are merged.
            The default is {}.
        innames : dict, optional
            A dict in the form of ``{data_name: rename_name}``.
            This is used to generate unique output column names in case of conflicts (i.e., same column names in different ``Data`` objects).
            By default, columns are renamed as '{column_name}_{data_name}'.
            If a ``data_name`` is included in ``innames``, the corresponding '{column_name}_{rename_name}' will be used instead.
            This can be used to avoid long column names.
            If subsets are kept (``keep_subsets=True``), conflicts in subset or group names will be handled in a similar manner.
            The default is {}.
        outname : str, optional
            The name of the merged data.
            If not given, this will be automatically generated from the names of data that are merged.
            The default is None.
        keep_subsets : bool, optional
            Whether the subsets of the data are kept and merged. The default is False.
        matchinfo_subset : bool, optional
            If ``keep_unmatched != []``, whether to add a subset 'matched/<this_data_name>/<name_of_data_matched_to_this_data>',
            indicating whether each row can be matched to that data.
            The default is False.
        verbose : bool, optional
            Whether to show more information on merging. The default is True.

        Raises
        ------
        ValueError
            - The name of this data is included in ``keep_unmatched``.
            - ``innames`` results in duplicated names.
        MergeError
            A data in ``keep_unmatched`` is the intermediary through which other data are matched.
        TypeError
            An entry of ``merge_columns``/``ignore_columns`` is neither a str nor an ``re.Pattern``.
        RuntimeError
            ``matchinfo_subset`` is True and the subset group 'matched/<this_data_name>' already exists.

        Returns
        -------
        matched_data : ``pyttop.table.Data``
            An ``pyttop.table.Data`` object containing the merged catalog.

        Notes
        -----
            If the ``keep_unmatched`` is not empty (``[]``), say ``keep_unmatched=['data1']``.
            Then, the rows in THIS data that has no match with the dataset called 'data1' are kept,
            and the columns from 'data1' for this row are missing values.

            'data1' may also have its subsets. When ``keep_subsets`` is set to True, the subsets of 'data1'
            are also merged. The rows with no match with 'data1' always do NOT belong to the subsets merged
            from 'data1'.

            A warning is issued if nothing is matched to this data, or if a name in
            ``keep_unmatched``, ``merge_columns`` or ``ignore_columns`` is not among the merged data.
        '''
        # if keep_unmatched:
        #     raise NotImplementedError('Until a bug is fixed, the "keep_unmatched" feature is disabled.')
        #     # say we have the matching: cat1 <- cat2 <- cat3. If we do not require match with cat2, and
        #     # cat3 may be directly matched to cat1, then some rows with cat1, cat3 but not cat2 are missing.

        # normalize keep_unmatched to a list of names
        if type(keep_unmatched) is str:
            keep_unmatched = [keep_unmatched]
        elif keep_unmatched is True:
            # all names to be merged according to the match tree; the first one is dropped
            # (it is asserted further below that this list equals data_names, which starts with self.name)
            keep_unmatched, _, _ = self._print_match_tree(self._match_tree(depth=depth)[0], silent=True)
            keep_unmatched = keep_unmatched[1:]
            if verbose: print(f'[merge] `keep_unmatched` set to all data matched to {self}: {keep_unmatched}')

        ## prepare variables
        matched = np.full((len(self),), True) # whether a record in self is matched to ALL (except those in keep_unmatched) the child data
        data1_matched_tables = []
        unnamed_count = 0
        data_names = [self.name]
        data_metas = {self.name: self.meta}
        data_subset_groups = [] # list of cut subset groups for each data
        subsets_to_be_added = [] # will be used if matchinfo_subset

        ## merge matchinfo
        if verbose: self.match_tree(depth=depth, detail=False)
        merged_matchinfo = self.merge_matchinfo(depth=depth)

        if len(merged_matchinfo) == 0:
            warnings.warn(f'nothing is matched to {self}',
                          stacklevel=2)

        ## check keep_unmatched
        if self.name in keep_unmatched:
            raise ValueError(f"cannot include base data '{self.name}' in `keep_unmatched`")
        for matchinfo in merged_matchinfo:
            # matchinfo.has_child is the (possibly empty) list of names matched through matchinfo.data1
            if matchinfo.data1.name in keep_unmatched and matchinfo.has_child:
                msg = (f"cannot include data '{matchinfo.data1.name}' in `keep_unmatched`, "
                f"because {matchinfo.has_child} is/are matched through the intermediary '{matchinfo.data1.name}'")
                raise MergeError(msg)

        ## get matched indices and handle metadata
        for matchinfo in merged_matchinfo:
            data1 = matchinfo.data1

            # update ``matched``
            if data1.name not in keep_unmatched:
                data1_matched = matchinfo.matched
                matched &= data1_matched # boolean array indicating whether a row of the base data is matched to ALL the child data

            # collect metadata of matched data and handle unnamed data
            if data1.name is None:
                unnamed_count += 1
                data_names.append(str(unnamed_count))
                data_metas[str(unnamed_count)] = data1.meta
            else:
                data_names.append(data1.name)
                data_metas[data1.name] = data1.meta

        if outname is None:
            # outname = 'match_' + '_'.join(data_names)
            outname = f'({data_names[0]}).MATCH({", ".join(data_names[1:])})'

        if unnamed_count > 0 and verbose:
            print(f'found no names for {unnamed_count} sets of data, automatically named with numbers.')

        # check possible misspellings of data names in keep_unmatched, merge_columns, ignore_columns
        for n in chain(keep_unmatched, merge_columns.keys(), ignore_columns.keys()):
            if n not in data_names:
                warnings.warn(f'No data named "{n}" is matched: did you mispell the name?',
                              stacklevel=2)

        ## cut data and subsets (if needed) ##
        # cut myself
        data = self.t[matched] # data is not self.t # even if `matched` is all True

        def resolve_regex(table, colname_list):
            # expand a list of str / compiled patterns into a list of column names;
            # a pattern contributes every column name of `table` that it matches with `search`
            outname = []
            for name in colname_list:
                if isinstance(name, str):
                    outname.append(name)
                elif isinstance(name, re.Pattern):
                    outname.extend(cn for cn in table.colnames if name.search(cn))
                else:
                    raise TypeError('Expected str or re.Pattern for column names in '
                                    f'merge_columns/ignore_columns, got {type(name)}')
            return outname

        # TODO: make Data object itself valid as, e.g., merge_columns keys. (e.g., `self in merge_columns`)
        def filter_columns(data, table):
            # filter columns in `table` (in place) based on the list of column names
            # corresponding to `data` in dicts `merge_columns` and `ignore_columns`


            if data.name in merge_columns:
                keep_columns = merge_columns[data.name]
                keep_columns = resolve_regex(table, keep_columns)
                table.keep_columns(keep_columns)
            if data.name in ignore_columns:
                remove_columnes = ignore_columns[data.name]
                remove_columnes = resolve_regex(table, remove_columnes)
                table.remove_columns(remove_columnes)

        filter_columns(self, data)
        # if self.name in merge_columns:
        #     data.keep_columns(merge_columns[self.name])
        # if self.name in ignore_columns:
        #     data.remove_columns(ignore_columns[self.name])

        if keep_subsets:
            subset_groups = Data._cut_subset_groups(self.subset_groups, matched)
            data_subset_groups.append(subset_groups)

        # cut data matched to me
        for matchinfo in merged_matchinfo:
            data1 = matchinfo.data1
            idx = matchinfo.idx
            data1_matched = matchinfo.matched

            # work on a copy so that column filtering does not modify data1.t
            data1_table = data1.t.copy()

            filter_columns(data1, data1_table)
            # if data1.name in merge_columns:
            #     data1_table.keep_columns(merge_columns[data1.name])
            # if data1.name in ignore_columns:
            #     data1_table.remove_columns(ignore_columns[data1.name])

            if data1.name in keep_unmatched: # keep unmatched
                if verbose: print(f'[merge] entries with no match for {data1._short_name} is kept.')
                # unmatched rows temporarily point to row 0 so that the indexing below is valid;
                # their values are then overwritten with a missing value (if one is decided) and masked
                idx[~data1_matched] = 0 # TODO: DANGER: The masked "data" will be valid values, i.e. the value on the first row! These may emerge when using e.g. np.array, plt.hist2d (which uses np.histogram2d).
                data1_table = Table(data1_table, masked=True)
                data1_matched_table = data1_table[idx]
                # mask values for unmatched records in data1
                for c in data1_matched_table.columns:
                    miss_val = Data._decide_missing_value(data1_matched_table[c])
                    if miss_val is not None:
                        data1_matched_table[c][~data1_matched] = miss_val
                        data1_matched_table[c].fill_value = miss_val # this avoids cases where the numpy's default fill_value is already present in the (unmasked) data
                    data1_matched_table[c].mask[~data1_matched]=True
                # finally keep only the rows of the base data that survive the merge
                data1_matched_table = data1_matched_table[matched]

                if keep_subsets:
                    # same steps for the subsets: reorder with idx, exclude unmatched rows, then cut with matched
                    subset_groups = Data._cut_subset_groups(data1.subset_groups, idx)
                    subset_groups = Data._mask_subset_groups(subset_groups, ~data1_matched)
                    subset_groups = Data._cut_subset_groups(subset_groups, matched)

                if matchinfo_subset:
                    subsets_to_be_added.append(Subset(
                        data1_matched[matched],
                        name=data1.name,
                        expression=f"<'{data1.name}' matched when merging to '{self.name}'>",
                        label=f"'{data1.name}' matched when merging to '{self.name}'"))

            else: # do not keep unmatched
                data1_matched_table = data1_table[idx[matched]]

                if keep_subsets:
                    subset_groups = Data._cut_subset_groups(data1.subset_groups, idx[matched])

            data1_matched_tables.append(data1_matched_table)

            if keep_subsets:
                data_subset_groups.append(subset_groups)

        # merge table and get data
        tables_to_be_matched = [data] + data1_matched_tables
        for t in tables_to_be_matched:
            t.meta.clear() # handle meta by myself, not by astropy

        assert find_dup(data_names).size == 0
        # names used to resolve conflicts: the `innames` entry if given, otherwise the data name
        data_renames = [innames[n] if n in innames else n for n in data_names]
        if find_dup(data_renames).size > 0:
            msg = f'duplication in names caused by `innames`: {find_dup(data_renames)}'
            raise ValueError(msg)
        matched_table = hstack(tables_to_be_matched, table_names=data_renames)
        matched_data = Data(matched_table, name=outname)
        matched_data.meta['path'] = '(merged data)'

        # merge subsets
        if keep_subsets:
            # data_names changed to data_renames in v0.4.3
            merged_subset_groups = self.__class__._merge_subset_groups(data_subset_groups, data_renames, verbose=verbose)
            # attach every merged subset to the new data
            for groupname, group in merged_subset_groups.items():
                for subsetname, subset in group.items():
                    subset._data = matched_data
            matched_data.subset_groups = merged_subset_groups

        # add subsets if matchinfo_subset
        if matchinfo_subset:
            group = f'matched/{self.name}'
            if group in matched_data.subset_groups:
                raise RuntimeError(f"A subset group named '{group}' already exists.")
            matched_data.add_subsets(*subsets_to_be_added, group=group)

        if verbose: print('[merge] merged: ' + ', '.join(data_names))

        ## generate data meta
        assert list(data_metas.keys()) == data_names
        matched_names, _, tree_str = self._print_match_tree(self._match_tree(depth=depth)[0], silent=True)
        assert data_names == matched_names
        # NOTE(review): `keep_unmatched` is stored by reference below; when the caller
        # relies on the default, this is the shared default list object of this method.
        merging = OrderedDict({ # detailed information for merging
            'notes': 'This is a table merged from several tables. The merging information is recorded below. '\
                     'The metadata for merged datasets are recorded in "metas".',
            'options': dict(
                depth=depth,
                keep_unmatched=keep_unmatched,
                keep_subsets=keep_subsets,
                matchinfo_subset=matchinfo_subset,
                ),
            'tree': '\n' + tree_str, # the match tree of the base data
            'merged': data_names, # names of the data merged
            'metas': data_metas, # metas for the data merged
            })
        matched_data.meta['merging'] = merging

        return matched_data



[docs]
    def match_merge(self, data1, matcher, keep_unmatched=[], merge_columns={}, ignore_columns={}, outname=None, verbose=True):
        '''
        Convenience wrapper: match ``data1`` to this data with ``matcher``, then
        merge everything that is matched to this data and return the merged data.

        ``data1``, ``matcher`` and ``verbose`` are passed to :meth:`~Data.match`;
        the remaining arguments (and ``verbose``) are passed to :meth:`~Data.merge`.
        See those methods for more information.
        '''
        self.match(data1=data1, matcher=matcher, verbose=verbose)
        merge_options = dict(
            keep_unmatched=keep_unmatched,
            merge_columns=merge_columns,
            ignore_columns=ignore_columns,
            outname=outname,
            verbose=verbose,
            )
        return self.merge(**merge_options)


    def _match_tree(self, depth=-1, matcher='base', datas=None, removed_datas=None):
        # matcher: how I am matched to my parent
        # tree: the matched datas, in tree form
        # datas: the matched datas

        # initialize vars if not given
        tree = OrderedDict()
        if datas is None:
            datas = OrderedDict()
        if removed_datas is None: # this is generally useless now
            removed_datas = OrderedDict()

        # add myself to the tree
        if any(self is i for i in datas) and depth <= datas[self]['depth']:
            # we have already seen this data before and do not use myself here to match
            tree[self] = dict(
                name = self.name,
                depth = depth,
                matcher = matcher,
                merge = False,
                child = OrderedDict(),
                )
        else: # we have not seen this data before OR find a shallower match here
            if any(self is i for i in datas) and depth > datas[self]['depth']: # remove the existing match
                datas[self]['dict'][self]['child'].clear()
                datas[self]['dict'][self]['merge'] = False
                removed_datas[datas.pop(self)['depth']] = self # depth: data to remove

            # add myself, use myself here to match
            datas[self] = {'depth': depth, 'dict': tree} # 'dict' is the dict containing it
            tree[self] = dict(
                name = self.name,
                depth = depth,
                matcher = matcher,
                merge = True,
                child = OrderedDict(),
                )
            if depth != 0: # add my children
                for info in self.matchinfo:
                    data = info.data1
                    matcher = info.matcher
                    data_tree, datas, removed_datas = data._match_tree(depth=depth-1, matcher=matcher, datas=datas, removed_datas=removed_datas)
                    assert not any(data is i for i in tree[self]['child'])
                    tree[self]['child'].update(data_tree)

        return tree, datas, removed_datas

    def _print_match_tree(self, tree, detail=True, indent='', tree_str='', silent=False):
        matched_names = []
        matched_ids = []
        # print match tree given tree returned by self._match_tree
        for data, info in tree.items():
            # print data
            if info['merge']: #id(data) not in matched_ids:
                matched_names.append(data.name)
                matched_ids.append(id(data))
            matcher = '' if not detail else f' [{info["matcher"]}]'
            name = 'Unnamed' if info['name'] is None else info['name']
            name = '(' + name + ')' if not info['merge'] else name
            print_str = f'{indent}{name}{matcher}'
            if not silent: print(print_str)
            tree_str += print_str + '\n'
            added_matched_names, added_matched_ids, tree_str = self._print_match_tree(info['child'], detail=detail, indent=indent+':   ', tree_str=tree_str, silent=silent)
            matched_ids += added_matched_ids
            matched_names += added_matched_names

        # just to be compatible with old _match_tree
        return matched_names, matched_ids, tree_str


    def _old_match_tree(self, depth=-1, detail=True, matched_names=[], matched_ids=[], indent='', matcher='base', tree_str='', silent=False):
        # Legacy recursive implementation of the match tree.
        # Returns (matched_names, matched_ids, tree_str), where the two lists are
        # extended copies of the inputs and tree_str is the accumulated text.

        # Work on copies: the list arguments (and in particular the shared default
        # lists) must never be modified in-place, otherwise the method would
        # "remember" the data visited in previous calls.
        names_seen = matched_names.copy()
        ids_seen = matched_ids.copy()

        # build the line for this data
        suffix = f' [{matcher}]' if detail else ''
        display_name = self.name if self.name is not None else 'Unnamed'
        already_matched = id(self) in ids_seen
        shown_name = f'({display_name})' if already_matched else display_name
        line = f'{indent}{shown_name}{suffix}'
        if not silent:
            print(line)
        tree_str += line + '\n'

        if already_matched:
            # this data is already matched: do not expand it anymore
            return names_seen, ids_seen, tree_str

        names_seen.append(display_name)
        ids_seen.append(id(self))

        if depth == 0:
            # depth exhausted: children are not shown
            return names_seen, ids_seen, tree_str

        # recurse into the data matched to this one
        for info in self.matchinfo:
            names_seen, ids_seen, tree_str = info.data1._old_match_tree(
                depth=depth-1,
                detail=detail,
                matched_names=names_seen,
                matched_ids=ids_seen,
                indent=indent+':   ',
                matcher=info.matcher,
                tree_str=tree_str,
                silent=silent,
                )
        return names_seen, ids_seen, tree_str


[docs]
    def match_tree(self, depth=-1, detail=True):
        '''
        Print a "match tree": every data that can be matched and merged to this data.

        Data directly matched to this data are its "children" and sit on "depth 1";
        data directly matched to those on "depth 1" sit on "depth 2", and so on.

        Parameters
        ----------
        depth : int, optional
            How deep the tree is expanded.
            With ``depth == 1`` only the direct children (no grandchildren) of
            this data are shown.
            With ``depth == -1`` all children (including all grandchildren) are shown.
            The default is -1.
        detail : bool, optional
            Whether to show detail (including how the data are matched). The default is True.

        '''
        separator = '---------------'
        print('Names with parentheses are already matched, thus they are not expanded and will be ignored when merging.')
        print(separator)
        # only the tree itself is needed; the other two returned items are unused here
        tree, _names, _ids = self._match_tree(depth=depth)
        self._print_match_tree(tree, detail=detail)
        print(separator)

    #### operation


[docs]
    def apply(self, func, processes=None, args=(), progress_bar=False, **kwargs):
        '''
        Apply function ``func`` to each row of the Table (``data.t``) to get a new column.
        This operation is not vectorized.

        Parameters
        ----------
        func : function
            A function to be applied to each row. Example::

                >>> def func(row): # row is a row of the Table.
                ...     return row['a'] + row['b']

            Note that if processes is not None, func should be a global function
            and should not be a lambda function.
        processes : None or int
            if int (>0) is given, this specifies the number of processes used to get the results.
            if -1 is given, will automatically use all available cpu cores.
            if None, multiprocessing will not be enabled.
            The default is None.
        args : Iterable, optional
            Additional positional arguments to be passed to func after ``row``.
            A single non-iterable value is treated as one additional argument.
            The default is ().
        progress_bar : bool, optional
            Only relevant when multiprocessing is not enabled. If set to True
            (and ``tqdm`` is installed), a progress bar will be shown.
            The default is False.
        **kwargs :
            Additional keyword arguments to be passed to func (not supported for multiprocessing).

        Returns
        -------
        list
            Result of applying ``func`` to each row.

        Raises
        ------
        TypeError
            If ``processes`` is neither None nor an int, or if keyword arguments
            are given together with multiprocessing.
        '''
        # Normalize ``args`` once, so that the serial and the multiprocessing
        # paths treat it identically: a lone non-iterable value becomes a
        # 1-tuple, and one-shot iterators are materialized so that they can be
        # re-used for every row.
        if not isinstance(args, Iterable):
            args = (args,)
        args = tuple(args)

        if processes is None:
            rows = self.t
            if progress_bar and has_tqdm:
                rows = tqdm(rows, total=len(self))
            return [func(row, *args, **kwargs) for row in rows]

        if type(processes) is not int:
            raise TypeError('"processes" should be None or int.')
        if kwargs:
            raise TypeError('passing kwargs is not supported for multiprocessing')

        # Pool(None) lets multiprocessing decide the number of workers
        n_workers = None if processes == -1 else processes
        with mp.Pool(n_workers) as pool:
            if args: # additional arguments are passed: use starmap
                return pool.starmap(func, ((row, *args) for row in self.t))
            return pool.map(func, self.t)


    def _get_colnames_variable(self):
        '''
        Collect the column names that can be used directly as Python variable
        names: they must be valid identifiers, must not be keywords, and must be
        unique (no other column shares the name).

        The list is stored in ``self.colnames_as_variables`` and also returned.
        '''
        occurrences = Counter(self.colnames)
        self.colnames_as_variables = [
            name
            for name, n_times in occurrences.items()
            if n_times == 1 and name.isidentifier() and not iskeyword(name)
            ]
        return self.colnames_as_variables


[docs]
    def eval(self, expression, to_col=None, **kwargs):
        '''
        Evaluate the value with an expression.

        In the expression, the columns of the table can be referred to with:

        - The name of the column, if the name can be regarded as a Python variable name,
          and it does not coincide with a name in the local/global namespace.
        - ``$(<column name>)``.
        - ``self['<column name>']``.

        The Data object itself can be referred to as ``self``.

        Parameters
        ----------
        expression : str
            The expression to be evaluated.
        to_col : str, optional
            Sets ``data[to_col]`` to the evaluated values of the expression.
            This is preferred to using ``data['name'] = data.eval(...)``,
            because the information of the expression is added to the metadata with ``data.eval(..., to_col='name')``.
            The default is None.
        **kwargs :
            If the expression uses some name that is not recognized (e.g. using a user-defined name will result in NameError),
            you can pass the values of the names here.

            For example, if you use an expression 'my_function(col) + my_value' (where 'col' is a column name in the data),
            you can pass ``my_function`` and ``my_value`` by::

                Data.eval('my_function(col) + my_value', my_function=my_function, my_value=my_value)

        Returns
        -------
        result :
            The result of the evaluation.

        Raises
        ------
        SyntaxError
            If the (translated) expression is not valid Python syntax.
        NameError
            If the expression uses a name that is neither a usable column name,
            a name in the module's global namespace, nor a name passed via ``**kwargs``.

        Warns
        -----
        UserWarning
            If a column name that appears in the expression is shadowed by an existing
            local/global name (and is not referred to as ``$(<column name>)``).
        '''
        # NOTE(review): the expression is executed with Python's built-in eval(),
        # i.e. it can run arbitrary code; it should only come from trusted input.

        # Snapshot of this method's local names at this point (``self``,
        # ``expression``, ``to_col`` and ``kwargs``). It has to be taken before any
        # other local variable is defined, so that helper variables do not leak
        # into the namespace of the evaluated expression.
        localvars = locals().copy()
        localvars.update(**kwargs) # user-supplied names override the snapshot
        self._get_colnames_variable() # refreshes self.colnames_as_variables
        _existing_names = []
        for _colname in self.colnames_as_variables:
            if _colname not in localvars and _colname not in globals():
                # if a column name is not occupied by an existing name,
                # add it to local namespace
                localvars[_colname] = self[_colname]
            elif _colname in expression and f"$({_colname})" not in expression:
                # the expression seems to include this column name, but not in $(column name) format
                # (this is a plain substring test, so it may also trigger when the
                # column name is merely part of a longer name in the expression)
                _existing_names.append(_colname)
        if _existing_names:
            warnings.warn("Column names ['{}'] coincidence with existing names in the local/global namespace, ".format("', '".join(_existing_names)) + \
                          'thus are not interpretated as column names. '\
                          "Consider refering column names with $(column name).")
        # translate the $(...) shorthand into explicit item access on this Data
        _eval_expression = re.sub(
            r"\$\((.*?)\)",  # replace $(...)
            lambda match: f"self['{match.group(1)}']", # to self['...']
            expression,
            )
        try:
            result = eval(_eval_expression, globals(), localvars)
        except SyntaxError as e:
            msg = f"'{expression}': invalid syntax (are you trying to directly refer to unsupported column names?)"
            raise SyntaxError(msg) from e
        except NameError as e:
            # NOTE(review): ``e.name`` is presumably only available on Python >= 3.10 -- confirm the supported versions
            msg = f"'{expression}': Unrecognized name '{e.name}'. Check if you have misspelled a column name. If you are using a name defined in your script, consider passing '{e.name}={e.name}' when calling eval()."
            raise NameError(msg) from e

        if to_col is not None:
            self[to_col] = result
            # record the expression in the metadata of the new column
            # (relies on 'src_detail' being present in the column metadata after the assignment above)
            self[to_col].meta['src_detail'] += f' with expr {expression!r}'
        return result



[docs]
    def mask_missing(self, cols=None, missval=None, verbose=True):
        '''
        Mask missing values represented by ``missval`` (e.g. -999)
        for columns ``cols``.

        For example, ``data.mask_missing(cols='col', missval=-999)``
        masks all -999 values in column "col", indicating that they are missing.

        If verbose, the information for the process will be printed.
        Note that the printed information indicates the number of elements masked
        in this process, rather than the total number of masked elements in the columns.
        To get the number of unmasked elements in a column, try::

            print(data.get_subsets('$unmasked/<column_name>'))

        Columns for which a NaN test is not defined (e.g. string columns) cannot
        contain NaN; when ``missval`` is NaN, nothing is masked in such columns.

        Parameters
        ----------
        cols : str or list of str, optional
            Name(s) of the columns to be masked. The default is all columns.
        missval : optional
            The value regarded as missing value. The default is NaN.
        verbose : bool, optional
            If verbose, the information for missing values will be printed.
            The default is True.
        '''
        if cols is None:
            cols = self.colnames
        if isinstance(cols, str):
            cols = (cols,)
        if missval is None:
            missval = np.nan

        if not self.t.masked:
            self.t = Table(self.t, masked=True, copy=False)  # convert to masked table

        def isnan(value):
            # NaN test that tolerates values for which np.isnan is undefined
            try:
                return np.isnan(value)
            except TypeError:
                return False

        # this does not depend on the column: evaluate it only once
        missval_is_nan = isnan(missval)

        for col in cols:
            column = self.t[col]
            if missval_is_nan:
                try:
                    mask = np.isnan(column)
                except TypeError:
                    # np.isnan is not defined for this dtype (e.g. strings):
                    # such a column cannot hold NaN, so nothing is selected
                    mask = np.zeros(np.shape(column), dtype=bool)
            else:
                mask = column == missval
            # A comparison on a masked column is itself masked; convert it to a
            # plain boolean selection (already-masked elements are not selected).
            mask = np.ma.filled(mask, False)
            n_miss = np.sum(mask)
            self.t[col].mask[mask] = True
            if verbose:
                n = len(self)
                print(f"[mask missing] col '{col}': {n_miss}/{n} ({n_miss/n*100:.2f}%) masked (value: {missval}).")



[docs]
    def check_duplication(self, *cols, action='print'):
        '''
        Check for duplicates for given columns

        Parameters
        ----------
        *cols : str
            The names of columns (if not given, all columns will be checked).
        action : str, optional
            What to do after checking. The valid actions are:

            - 'print': print the results
            - 'bool': return whether duplicates are found
            - 'detail' (or 'value'): return a dict containing the duplicate values for columns with duplicates
            - 'subset': return a row subset including those where duplicates are found

            The default is 'print'.

        Returns
        -------
        None, bool, dict or ``pyttop.table.Subset``
            Depends on ``action`` (see above).

        Raises
        ------
        ValueError
            If ``action`` is not recognized.
        '''
        if len(cols) == 0:
            cols = self.colnames
        dups = {}
        for col in cols:
            vals = self[col]
            dup_vals = find_dup(vals)
            if isinstance(dup_vals, Column):
                dup_vals = dup_vals.data
            if dup_vals.size > 0:
                dups[str(col)] = dup_vals # str(col) supports input like ('col1', 'col2')

        if action == 'print':
            if dups: # duplicate found
                for col, dup_vals in dups.items():
                    print(f"Duplicates found for '{col}': {dup_vals}")
            else: # no duplicates
                print(f'No duplicates found for: {cols}')
        elif action == 'bool':
            return bool(dups)
        elif action in ['value', 'detail']:
            return dups
        elif action == 'subset':
            selection = np.full(len(self), False)
            for col in cols:
                key = str(col)
                if key not in dups:
                    # only columns with duplicates are recorded in ``dups``;
                    # a column without duplicates contributes no rows
                    continue
                unmasked = ~self[col].mask if np.ma.is_masked(self[col]) else True
                selection |= np.isin(self[col], dups[key]) & unmasked
            subs = Subset(selection, name=f'dup_{"_".join(list(dups.keys()))}')
            subs.eval_(self)
            return subs
        else:
            raise ValueError(f"Unrecognized action '{action}'.")



[docs]
    def sort(self, keys, *, keep_subsets=False, kind=None, reverse=False):
        '''
        Return a new ``Data`` instance whose table is sorted by one or more keys (columns).

        In contrast to the ``sort()`` method of ``astropy.table.Table`` (i.e., ``data.t.sort()``),
        the sort is not performed in-place: this data is left unchanged.

        Parameters
        ----------
        keys : str or list of str
            The column name(s) to order the table by.
        keep_subsets : bool, optional
            If ``True``, the subsets will be preserved. The default is ``False``.
        kind : {'quicksort', 'mergesort', 'heapsort', 'stable'}, optional
            Sorting algorithm used by ``numpy.argsort``.
        reverse : bool, optional
            If ``True``, sort in reverse order. The default is ``False``.

        Returns
        -------
        ``pyttop.table.Data``
            A ``Data`` instance with the table sorted.

        '''
        sort_options = dict(kind=kind, reverse=reverse)

        # sort a copy of the table, so that nothing is modified in-place
        sorted_table = self.t.copy()
        sorted_table.sort(keys, **sort_options)
        result = Data(sorted_table, name=f'({self.name}).SORTED')

        # metadata: start from scratch and record how this data was obtained
        result.meta.clear()
        result._path = f"(data '{self.name}' sorted)"
        sort_record = OrderedDict({
            'keys': keys,
            'kind': kind,
            'reverse': reverse,
            })
        result.meta.update({
            'path': result._path,
            'sort': sort_record,
            'notes': f"The metadata for the orignal data '{self.name}' is recorded in 'meta'.",
            'meta': self.meta,
            })

        if not keep_subsets:
            return result

        # subsets: re-order them with the same row indices
        row_order = self.t.argsort(keys, **sort_options)
        result.subset_groups = self.__class__._cut_subset_groups(self.subset_groups, row_order, result)
        return result


        # ## in-place solution
        # # This operation should be discouraged, especially when any subset is defined.
        # # remove any matches
        # self.reset_match()

        # # operations on astropy Table
        # self.t.sort(keys, kind=kind, reverse=reverse)

        # # handle subsets
        # if not keep_subsets:
        #     self.clear_subsets()
        # else: # update subsets
        #     indexes = self.t.argsort(keys, kind=kind, reverse=reverse)
        #     # sort() is performed in-place rather than creating a new Data with the original Data intact.
        #     # Detach all old subset from this Data.
        #     # However, one **caveat** is that it can never handle those not recorded in this data.
        #     sorted_subset_groups = self.__class__._cut_subset_groups(self.subset_groups, indexes)

        #     for group in self.subset_groups:
        #         for subset in self.subset_groups[group]:
        #             self.subset_groups[group][subset]._data = None

        #     self.subset_groups = sorted_subset_groups


    #### metadata


[docs]
    def from_which(self, colname=None, detail=True):
        '''
        When reading a dataset from a file using ``Data(<path>, name=<name>)``,
        the name of the data is associated with each column.
        After matching and merging it with other datasets, you may want to
        check the name of the data from which ``colname`` is matched.
        See examples below.

        **WARNING**: The information for user-added columns may be invalid.
        (A warning with this message is issued once per call.)

        Parameters
        ----------
        colname : str, optional
            Column name.
            If this argument is not given, a dict with the information for all columns
            will be returned.
        detail : bool, optional
            Whether the detail of the data is returned. The default is True.

        Returns
        -------
        str or dict
            The name (str) of the data from which ``colname`` is matched
            (an empty string if no source is recorded for the column),
            or a dict containing the information for all columns.

        Raises
        ------
        ColumnNotFoundError
            If ``colname`` is not a column of this data.

        Examples
        --------
        Say you have two catalog files, ``cat1.csv`` and ``cat2.csv``.

            >>> cat1 = Data('cat1.csv', name='cat1') # with columns 'col1', etc.
            >>> cat2 = Data('cat2.csv', name='cat2') # with columns 'col2', etc.
            >>> cat_merged = cat1.match(cat2, SkyMatcher()).merge()
            ... # cat_merged has columns 'col1', 'col2', etc.
            >>> cat_merged.from_which('col1')
            cat1 (loaded from "cat1.csv")
            >>> cat_merged.from_which('col2')
            cat2 (loaded from "cat2.csv")

        '''
        warnings.warn('WARNING: The information for user-added columns may be invalid.',
                      stacklevel=2)

        def describe(name):
            # source information recorded in the metadata of one column
            if name not in self.colnames:
                raise ColumnNotFoundError(name)
            meta = self.t[name].meta
            if 'src' not in meta.keys():
                return ''
            info = meta['src']
            if detail:
                info += f" ({meta['src_detail']})"
            return info

        if colname is None:
            # Use the helper rather than calling this method again for every
            # column, so that the warning above is issued only once per call.
            return OrderedDict((name, describe(name)) for name in self.colnames)
        return describe(colname)



[docs]
    def metaJson(self, save_path=None, yes=False):
        '''
        Generate a json string for the metadata of this Data.

        The metadata of a ``pyttop.table.Data`` object typically saves the information
        on how it was initialized, how it was merged (if it is a merged catalog), etc.
        It can be retrieved with ``data.meta``.
        This is saved as the metadata of ``data.t``, i.e. ``data.meta is data.t.meta``.

        Parameters
        ----------
        save_path : str, optional
            A path to save the json as a file. The default is None (do not save).
        yes : bool, optional
            If set to True, existing files will be overwritten without prompts. The default is False.

        Returns
        -------
        meta : str
            A json string.

        '''
        # work on a deep copy: self.meta itself must stay untouched
        meta_copy = deepcopy(self.meta)

        # Walk through this metadata and the metadata of all merged data
        # (nested under 'merging' -> 'metas'), and split each multi-line match
        # tree string into a list of lines.
        pending = [meta_copy]
        while pending:
            node = pending.pop()
            if 'merging' not in node.keys():
                continue
            merging = node['merging']
            merging['tree'] = merging['tree'].split('\n')
            pending.extend(merging['metas'][data] for data in merging['metas'])

        meta_json = json.dumps(meta_copy, ensure_ascii=False, indent=4)
        if not save_path:
            return meta_json

        if os.path.exists(save_path) and not yes:
            pause_and_warn(f"file '{save_path}' already exists!",
                           choose='Proceed to overwrite this file?',
                           yes_message=f"file '{save_path}' overwritten.")
        with open(save_path, 'w', encoding='utf8') as f:
            f.write(meta_json)
        return meta_json


    def print_meta(self):
        # print the metadata of this data as a json string (see metaJson())
        meta_json = self.metaJson()
        print(meta_json)

    def clear_meta(self, in_place=False):
        # placeholder: clearing the metadata is not implemented yet
        raise NotImplementedError

    #### subsets
    
    # Prefixes that a user-given subset group name must not start with:
    # the ``_check_group_name`` decorator raises ValueError for such names.
    # The old '$unmasked/' is kept for backward compatibility;
    # it is parsed with the normal group/name/subset_name convention.
    _reserved_groupname_prefixes = ("$unmasked:", "$eval:")
    # alias: the special-subset prefixes are the very same tuple as the reserved prefixes
    _special_subset_prefixes = _reserved_groupname_prefixes
    
    @staticmethod
    def _check_group_name(arg_name='group_name'):
        # Decorator factory for methods that accept a subset group name.
        # ``arg_name`` is the name of the method's argument holding the group name.
        # Before the method runs, the group name is validated:
        # - None is accepted as is;
        # - a str must not start with a prefix reserved for special subsets (ValueError);
        # - any other type is rejected (TypeError).
        def decorator(meth):
            @wraps(meth)
            def checked(self, *args, **kwargs):
                # resolve the value of the argument, however it was passed
                # (positionally, by keyword, or left to its default)
                call = inspect.signature(meth).bind(self, *args, **kwargs)
                call.apply_defaults()
                value = call.arguments[arg_name]

                if value is not None:
                    if not isinstance(value, str):
                        raise TypeError(f'unsupported type for group name: {type(value)}')
                    reserved = self.__class__._reserved_groupname_prefixes
                    clash = next((p for p in reserved if value.startswith(p)), None)
                    if clash is not None:
                        raise ValueError(
                            f"group name '{value}' cannot start with reserved prefix '{clash}'"
                            )
                return meth(self, *args, **kwargs)
            return checked
        return decorator

    def _gen_subset_all(self):
        # Build the subset named "all", which selects every row of this data.
        # This is called in __init__() and clear_subsets(), i.e. the subset is
        # re-evaluated when clearing subsets, in case len(self) changes.
        every_row = np.ones(len(self), dtype=bool)
        all_rows = Subset(every_row, name='all', expression='all', label='All')
        all_rows._data = self # attach the subset to this data
        return all_rows


[docs]
    @_check_group_name('group')
    def add_subsets(self, *subsets, group=None, listalways=False, verbose=True):
        '''
        Add subsets to a subset group.

        A subset is a selection of rows; a subset group is a group of subsets.

        Note that a subset does not "watch" the changes in the data:
        once added to the data, it never changes, even if the data changes.
        To update a subset, add it again to replace the old one.

        Parameters
        ----------
        *subsets : ``pyttop.table.Subset``
            The subsets to be added to this group.
            See :class:`Subset` for more information.
        group : str, optional
            The name of the subset group. If not specified, the default subset group will be used.
        listalways : bool, optional
            If True, always returns the tuple of subsets (even if it contains only one subset).
            The default is False.
        verbose : bool, optional
            Whether or not information is printed on the screen.
            The default is True.

        Return
        ------
        subsets : ``pyttop.table.Subset`` or tuple
            The arguments, i.e. a tuple of subset objects
            (or the subset itself, if exactly one subset is given and ``listalways`` is False).

        Raises
        ------
        ValueError
            If the group is the special group '$unmasked', if an argument is not a
            ``Subset``, or if a subset would replace 'all' in the 'default' group.

        '''
        # TODO: add arg "overwrite=False". if not overwrite, pass existing_names to Subset.eval_ to rename; otherwise, overwrite the subset
        group = 'default' if group is None else group
        if group == '$unmasked':
            raise ValueError("Subset group '$unmasked' is a special group that cannot be modified.")
        if group not in self.subset_groups.keys():
            self.subset_groups[group] = {} # create the new group

        replaced_paths = []
        for new_subset in subsets:
            if not isinstance(new_subset, Subset):
                raise ValueError(f"expected pyttop.table.Subset, got {type(new_subset)}")
            # evaluate the subset on this data, given the names already in the group
            new_subset.eval_(self, self.subset_groups[group].keys())
            subset_name = new_subset.name
            if group == 'default' and subset_name == 'all':
                raise ValueError("Subset name 'all' in the 'default' group is reserved and cannot be overwritten.")
            if subset_name in self.subset_groups[group].keys():
                replaced_paths.append(f'{group}/{subset_name}')
            self.subset_groups[group][subset_name] = new_subset

        if verbose and replaced_paths:
            joined_paths = "', '".join(replaced_paths)
            print("[add_subsets] subset(s) at '{}' replaced".format(joined_paths))

        return_single = not listalways and len(subsets) == 1
        return subsets[0] if return_single else subsets



[docs]
    @_check_group_name('group_name')
    def subset_group_from_values(self, column, group_name=None, overwrite=False):
        '''
        Create a subset group by the unique values of a column.

        For example, if a column named "class" has 3 possible values, "A", "B" and "C",
        a subset group will be defined with 3 subsets for class=A, B, C, respectively.
        Masked values of the column are ignored.

        Parameters
        ----------
        column : str
            The name of the column.
        group_name : str, optional
            The name of the created subset group.
            The default is the name of the column.
        overwrite : bool, optional
            When a group with ``group_name`` already exists, whether to overwrite the group.
            The default is False.

        Returns
        -------
        list
            A list of the created subsets
            (consistent with ``subset_group_from_ranges``).

        Raises
        ------
        ValueError
            A group with ``group_name`` already exists, and ``overwrite`` set to ``False``;
            or ``group_name`` is 'default' or '$unmasked'.
        '''
        if group_name is None:
            group_name = column

        if group_name == 'default':
            raise ValueError("name 'default' is not allowed for new subset groups")

        if group_name == '$unmasked':
            raise ValueError("Subset group '$unmasked' is a special group that cannot be modified.")

        if group_name in self.subset_groups and not overwrite:
            raise ValueError(f'A subset group with name "{group_name}" already exists.')

        col = self.t[column]
        if isinstance(col, np.ma.MaskedArray):
            col = col[~col.mask] # only unmasked values define subsets
        values = np.unique(col)
        if len(values) > 10:
            warnings.warn(f'A total of {len(values)} unique values found in column "{column}". This will result in a subset group with a lot of subsets.')

        # TODO: use self.add_subsets (with overwrite=True) [make sure to test the changes before using it!]
        self.subset_groups[group_name] = {} # (re)create the group
        subsets = []
        for value in values:
            subset = Subset.by_value(column, value)
            subset.eval_(self)
            name = subset.name
            self.subset_groups[group_name][name] = subset
            subsets.append(subset)

        return subsets



[docs]
    @_check_group_name('group_name')
    def subset_group_from_ranges(self, column, ranges, group_name=None, overwrite=False):
        '''
        Create a subset group from several ranges of values of a column.

        For example, ``data.subset_group_from_ranges(column='col1', ranges=[[0, 1], [1, 2]])``
        defines a subset group named ``'col1'``, which includes 2 subsets, ``0 < col1 < 1`` and ``1 < col1 < 2``.

        Parameters
        ----------
        column : str
            The name of the column.
        ranges : list of lists (or similar objects)
            List of ranges.
        group_name : str, optional
            The name of the created subset group.
            The default is the name of the column.
        overwrite : bool, optional
            When a group with ``group_name`` already exists, whether to overwrite the group.
            The default is False.

        Returns
        -------
        list
            A list of the created subsets.

        Raises
        ------
        ValueError
            A group with ``group_name`` already exists, and ``overwrite`` set to ``False``.
        '''
        group_name = column if group_name is None else group_name

        # names that can never be used for a new group
        if group_name == 'default':
            raise ValueError("name 'default' is not allowed for new subset groups")
        if group_name == '$unmasked':
            raise ValueError("Subset group '$unmasked' is a special group that cannot be modified.")

        if not overwrite and group_name in self.subset_groups:
            raise ValueError(f'A subset group with name "{group_name}" already exists.')

        # TODO: use self.add_subsets (with overwrite=True) [make sure to test the changes before using it!]
        self.subset_groups[group_name] = {} # (re)create the group
        created = []
        for bounds in ranges:
            range_subset = Subset.by_range(**{column: bounds})
            range_subset.eval_(self)
            self.subset_groups[group_name][range_subset.name] = range_subset
            created.append(range_subset)
        return created



[docs]
    @_check_group_name('group')
    def clear_subsets(self, group=None):
        '''
        Clear user-defined subsets.

        Parameters
        ----------
        group : str, optional
            Name of the subset group to be cleared.
            If not specified, all user-defined subsets are deleted.
        '''
        # Note: this only removes the subset from the list; subset._data still unchanged

        if group in (None, 'all'): # clear all groups
            # print('INFO: subsets reset to default.')
            self.subset_groups = {
                'default': {'all': self._gen_subset_all()}
                }
        elif group in ('default',): # clear default group
            self.subset_groups['default'] = {'all': self._gen_subset_all()}
        elif group in self.subset_groups.keys(): # clear a certain group
            del self.subset_groups[group]
        elif group in ['$unmasked']: # trying to clear a special group
            raise ValueError(f"'{group}' is a special group that cannot be cleared")
        else:
            warnings.warn(f"group name '{group}' does not exist, no need to clear")



[docs]
    def get_subsets(self, path=None, name=None, group=None, listalways=False, force=False):
        '''
        Retrieve one or more subsets, given a group name, subset name(s),
        or path(s) formatted as ``'<group_name>/<subset_name>'``.

        If none of ``path``, ``name`` and ``group`` is provided, all subsets are returned,
        organized by group and subset names and accessible as a nested dictionary::

            >>> subsets = data.get_subsets()
            >>> mysubset = subsets['group_name']['subset_name']

        Note that a special subset is temporarily created when retrieving (or referring to) it.
        They can only be retrieved using the paths (e.g., ``'$unmasked:<column name>'``).
        Otherwise, a `GroupNotFoundError` will be raised.

        Parameters
        ----------
        path : str or list of str, optional
            The path or a list of paths.
            If provided, the ``name`` and ``group`` arguments are ignored.
            If a ``Subset`` object is given, that object itself is returned.
            The default is None.
        name : str or list of str, optional
            The names of subsets or a list of names.
            Defaults to all subsets in the specified group.
        group : str, optional
            The name of the group. Defaults to the default group.
        listalways : bool, optional
            If True, always returns a list of subsets (even if the list contains only one subset).
            The default is False.
        force : bool, optional
            Relevant only when ``path`` (or ``name``) is a ``Subset`` object.
            If this ``Subset`` object is not a subset of this data, an exception will be raised.
            Setting ``force`` to True will bypass this exception.
            Default is False.

        Returns
        -------
        ``pyttop.table.Subset`` or list of ``pyttop.table.Subset``
            The specified subset or list of subsets.

        Notes
        -----
        A special subset is a virtual subset that does not actually exist.
        It is used to create a (new) subset as if retrieving an existing subset from
        the data.
        These virtual subsets are only created when ``get_subsets()``
        is called and are not added to the data. To store a virtual subset as a "normal" subset in the
        ``pyttop.table.Data`` instance, use the following::

            data.add_subsets(
                data.get_subsets('<path to the special subset>'),
                )

        '''
        # TODO: add new special subsets
        ### OLD DOCSTRING ###
        # Recognized special subsets include:

        # - ``$unmasked``. This subset group contains virtual subsets indicating whether the values in
        #   a specified column are not masked (i.e., a subset in this group contains rows where the value
        #   for the specified column is not masked).
        #   To retrieve such a subset, use::

        #       data.get_subsets('$unmasked/<column name>')

        #   Note that a new subset is created each time ``get_subsets()`` is called to retrieve such a subset.
        #   The old subsets remain unchanged even if the column's mask changes. For example::

        #       subset0 = data.get_subsets('$unmasked/col1')
        #       # changing the mask of column 'col1'
        #       subset1 = data.get_subsets('$unmasked/col1')
        #       subset0 is not subset1 # True
        #       # NOTE: subset0 is the old subset that does not represent the masking of 'col1' now.

        # Examples
        # --------
        # Under construction

        # (i.e. a special subset group and the virtual subsets therein
        # is never remembered by a ``pyttop.table.Data`` instance)

        nothing_specified = all(arg is None for arg in (path, name, group))
        if nothing_specified:
            # no selection at all: return every subset
            return self.subsets()

        selection = dict(path=path, name=name, group=group)
        return self._get_subsets(listalways=listalways, force=force, **selection)


    def _get_subsets(self, path=None, name=None, group=None, listalways=False, force=False):
        # see user API get_subsets()
        autosearch = False # do not search in other groups unless the user inputs ONLY subsets
        if path is not None:
            if group is not None or name is not None:
                warnings.warn('Since the argument "path" is given, arguments "name"/"group" are ignored.',
                              stacklevel=2)
            if listalways and isinstance(path, (Subset, str)):
                path = [path]
            if isinstance(path, (Subset, str)):
                return self._get_subset_from_path(path, autosearch=autosearch, force=force)
            if isinstance(path, Iterable):
                subsets = []
                for p in path:
                    subsets.append(self._get_subset_from_path(p, autosearch=autosearch, force=force))
                return subsets
            else:
                raise TypeError(f'path should be str or Iterable, got {type(path)}')
        else:
            if group is None: # user inputs ONLY subsets
                autosearch = True
                group = 'default'
            if type(group) is not str:
                raise TypeError('group should be a string')

            if group not in self.subset_groups.keys():
                raise GroupNotFoundError(group)

            if name is None:
                name = self.subset_groups[group].keys()
            if listalways and isinstance(name, (Subset, str)):
                name = [name]
            if type(name) is str:
                return self._get_subset_from_path(f'{group}/{name}', autosearch=autosearch)
            if isinstance(name, Iterable):
                subsets = []
                for n in name:
                    if isinstance(n, Subset): # it is itself a Subset!
                        subsets.append(self._get_subset_from_path(n, autosearch=autosearch, force=force))
                    else:
                        subsets.append(self._get_subset_from_path(f'{group}/{n}', autosearch=autosearch))
                return subsets
            elif isinstance(name, Subset): # it is itself a Subset
                return self._get_subset_from_path(name, autosearch=autosearch, force=force)
            else:
                raise TypeError(f'name should be str or Iterable, not {type(name)}')

    @classmethod
    def _parse_subset_path(cls, path):
        # returns group, name given path
        
        # detects special subsets
        for prefix in cls._special_subset_prefixes:
            if path.startswith(prefix):
                return prefix, path[len(prefix):]
        
        # normal 'group/name/subset_name' convention
        if '/' not in path:
            group = 'default'
            name = path
        else:
            group, name = path.rsplit('/', maxsplit=1) # allows '/' in groupname
        
        return group, name

    def _get_subset_from_path(self, path, autosearch=False, force=False):
        '''
        Resolve one path (or ``Subset`` object) to a ``Subset``.

        Parameters
        ----------
        path : str or Subset
            A subset path, or a ``Subset`` (which is returned as is after the association check).
        autosearch : bool, optional
            If the name is not found in the group given by ``path``,
            search for this name in all groups instead of raising.
        force : bool, optional
            For a ``Subset`` input: if True, a failed association check
            only generates a warning instead of an exception.

        Raises
        ------
        GroupNotFoundError, SubsetNotFoundError
        '''
        if isinstance(path, Subset): # it is itself a Subset
            self._check_subset_association(path, action='warn' if force else 'raise')
            return path

        klass = self.__class__
        group, name = klass._parse_subset_path(path)

        # Special (virtual) subsets: either a registered prefix, or the bare '$unmasked'
        # group, which is what a slash-separated path ('$unmasked/<column>') parses to.
        if group in klass._special_subset_prefixes or group in ['$unmasked']:
            return self._get_special_subset(group, name)

        all_groups = self.subset_groups
        if group not in all_groups.keys():
            raise GroupNotFoundError(
                group,
                suggest_names=get_close_matches(group, all_groups.keys()),
                )

        members = all_groups[group]
        if name in members.keys():
            return members[name]
        if autosearch:
            # search for this name in all groups
            return self._search_subset_from_name(name)
        raise SubsetNotFoundError(
            f"{group}/{name}", kind='path',
            suggest_names=get_close_matches(name, members.keys()),
            )

    def _search_subset_from_name(self, name, verbose=True):
        # get subset for name without knowing the name of the group
        group = None
        subset = None
        for group_name, subsets in self.subset_groups.items():
            if name in subsets.keys():
                if subset is not None:
                    raise ValueError(f"subset name is ambiguous: '{name}' found in multiple groups")
                subset = subsets[name]
                group = group_name
        if subset is None:
            raise SubsetNotFoundError(name, kind='name')

        if verbose: print(f"[subset] Found subset '{name}' in group '{group}'.")
        return subset

    def _get_special_subset(self, group, name):
        if group in ['$unmasked:', '$unmasked']: # '$unmasked:<column_name>'
            if np.ma.is_masked(self[name]):
                unmasked = ~self[name].mask # here name is a column name
                # Notes: the '~' operation creates a new array. Consider this: a = np.array((True, True)); b = ~a; a[0] = not a[0]; print(np.all(b == ~a)) # False
            else:
                unmasked = np.full(len(self), True)
            subset = Subset(
                unmasked,
                name=f'$unmasked({name})',
                expression=f"~self['{name}'].mask",
                label=self.get_labels(name)+' unmasked'
                )
            subset.eval_(self)
            return subset
        elif group == '$eval:':
            subset = Subset(name)
            subset.eval_(self)
            return subset
        else:
            raise ValueError(f"unrecognized special subset: '{group}'")

    def _subset_associates(self, subset):
        '''
        Return True if the data recorded by ``subset`` (``subset.data``) is this object.

        Raises ``TypeError`` if ``subset`` is not a ``Subset``.
        '''
        if isinstance(subset, Subset):
            # identity check only; membership in ``self.subset_groups`` is not required
            return subset.data is self
        raise TypeError('expected a Subset')

    def _check_subsets_consistency(self):
        consist_dict = {}
        for groupname, group in self.subset_groups.items():
            consist_dict[groupname] = {}
            for subsetname, subset in group.items():
                if subset.data is None:
                    consist_label = None
                elif subset.data is self:
                    consist_label = True
                else:
                    consist_label = False # however, this should not happen
                consist_dict[groupname][subsetname] = consist_label
        return consist_dict

    def _check_subset_association(self, subset, action='raise'):
        '''
        Checks if ``subset`` describes a subset of ``self``.
        This is done by checking if the data recorded (if any) by ``subset`` is ``self``.

        Note that the condition this method checks is weaker than requiring ``subset in self``.
        A subset can be no longer recognized by Data ``self`` (possibly because of
        replacing a subset at a certain path with a new subset);
        However, if this subset recognizes itself as a subset of ``self``,
        this method also returns True.

        Parameters
        ----------
        subset : Subset
            A Susbet.
        action : str, optional
            What to do if ``subset`` seems NOT to be associated with ``self``.
            If set to 'raise', an exception will be raised
            if ``subset`` seems not to be a subset of ``self``.
            If set to 'warn', a warning is generated.
            If set to 'quiet', no exception or warning will be generated.
            The default is 'raise'.
        '''
        # original version was checking if ``subset in self``
        # or the data recorded (if any) by ``subset`` is ``self``.
        if self._subset_associates(subset):
            return True
        else:
            msg = f'{subset} is not a subset of {self}'
            if action in [True, 'raise', 'error', 'exception']:
                raise TypeError(msg)
            elif action in ['warn', 'warning']:
                warnings.warn(msg, stacklevel=3)
            elif action in [False, 'quiet']:
                pass
            else:
                raise ValueError(f"unexpected action '{action}'")
            return False

    def _data_from_subset(self, subsets, minimal=False):
        '''
        Cut this data with ``Subset`` objects and return the resulting sub-dataset(s).

        Parameters
        ----------
        subsets : ``Subset`` or list of ``Subset``
            The selection(s) to apply.
        minimal : bool, optional
            If True, only the table is cut; the metadata and the subset groups of the
            new data are not handled. Meant for temporary subset data
            (e.g. when used in ``Data.plot()``).
            The default is False.

        Returns
        -------
        ``Data`` or list of ``Data``
            A single ``Data`` if a single ``Subset`` was given, otherwise a list.
        '''
        # NOTE: whether the subsets are associated with this data is NOT checked here.
        single = isinstance(subsets, Subset)
        requested = [subsets] if single else subsets

        results = []
        for sub in requested:
            rows = np.array(sub)
            piece = Data(self.t[rows], name=f'({self.name}).SUBS({sub.name})')

            if not minimal:
                # metadata: start afresh and keep the original metadata under 'meta'
                piece.meta.clear()
                piece._path = f"(data '{self.name}' cut by subset)"
                subset_info = OrderedDict({
                    'name': sub.name,
                    'expression': sub.expression,
                    'label': sub.label,
                    'fraction': f'{sub.size}/{len(sub)}',
                    })
                piece.meta.update({
                    'path': piece._path,
                    'subset': subset_info,
                    'notes': f"The metadata for the orignal data '{self.name}' is recorded in 'meta'.",
                    'meta': self.meta,
                    })

                # subsets: cut the existing subsets with the same rows
                piece.subset_groups = Data._cut_subset_groups(self.subset_groups, rows, piece)

            results.append(piece)

        if single:
            assert len(results) == 1
            return results[0]
        return results


[docs]
    def subset_data(self,
                    path=None, name=None, group=None,
                    expr=None, verbose=True, **kwargs):
        '''
        Get the data of one subset (or several subsets).

        The subsets are specified in the same way as for ``get_subsets``, i.e. with the
        name(s) of the subset group and subset(s), or with the full path(s)
        (``'<group_name>/<subset_name>'``). While ``get_subsets`` returns ``Subset``
        objects, this method returns the selected rows as ``Data``.

        A ``Subset`` object (or a list of them) may also be passed to ``path``
        to directly get the data.

        For convenience, an expression can be given instead::

            data.subset_data(expr = 'col1 == 1')

        which is equivalent to::

            data.subset_data(data.add_subsets(Subset(expr), group='temp'))

        This is similar to::

            data.t[data.t['col1'] == 1]

        but supports expressions and returns a Data.

        Parameters
        ----------
        path : Subset OR list of Subset OR str OR list of str, optional
            A Subset object or a list of Subset objects,
            or the path or a list of paths.
            If this is given, arguments ``name`` and ``group`` are ignored.
            The default is None.
        name : str or list of str, optional
            The names of subsets, or a list of names.
            The default is all subsets in the specified group.
        group : str, optional
            The name of the group.
            The default is the default group.
        expr : str, optional
            An expression (e.g. ``col1 == 1``) used to construct a ``Subset``,
            which is added to the group ``'temp'``.
            Cannot be combined with ``path``/``name``/``group``.
            The default is None.
        verbose : bool, optional
            Passed to ``add_subsets()`` when ``expr`` is given.
            The default is True.
        kwargs :
            Arguments passed to ``Subset()`` when ``expr`` is given.

        Returns
        -------
        ``pyttop.table.Data`` or list of ``pyttop.table.Data``
            The subset of data or list of subsets of data specified.

        Raises
        ------
        ValueError
            If ``expr`` is given together with ``path``, ``name`` or ``group``.
        '''
        if expr is None:
            selected = self._get_subsets(path=path, name=name, group=group)
            return self._data_from_subset(selected)

        if not (path is None and name is None and group is None):
            raise ValueError("Supply either an 'expr' argument "
                             "or the 'path'/'name'/'group' arguments")
        selected = self.add_subsets(Subset(expr, **kwargs), group='temp', verbose=verbose)
        return self._data_from_subset(selected)



[docs]
    def subset_summary(self, group=None):
        '''
        Get a summary table for the subsets and subset groups.

        The table consists of the following columns:

        - `group`: name of the subset group
        - `name`: name of the subset
        - `size`: size of the subset
        - `fraction`: fraction of the size to the total number
          (NaN if the total number is zero)
        - `expression`: expression/source code that specifies the selection of the subset
        - `label`: label of the subset used for plotting

        If no group is specified, the table starts with one placeholder row for each
        kind of special subsets (``$unmasked`` and ``$eval``), with size -1 and fraction NaN.

        Parameters
        ----------
        group : str or list of str, optional
            The name (or list of names) of the subset group(s) to be shown in the table.
            If not given, all groups will be shown by default.

        Returns
        -------
        summary : ``astropy.table.Table``
        '''
        summary = Table(names=['group', 'name', 'size', 'fraction', 'expression', 'label'],
                        dtype=['str', 'str', 'int', 'float', 'str', 'str']
                        )
        if group is None: # no group specified
            # add information for special groups
            special_groups = (
                ('$unmasked', '<special subsets: item in col unmasked>'),
                ('$eval', '<special subsets: rows satisfy expression>'),
                )
            for special_name, description in special_groups:
                summary.add_row(dict(
                    group=special_name, name='-',
                    size=-1, fraction=np.nan, # an int (not the string '-1') for the int column
                    expression=description,
                    label='-',
                    ))
            # show all groups
            group = self.subset_groups.keys()
        elif isinstance(group, str):
            group = [group]

        # built before adding rows, so that an unknown group name fails early
        shown_groups = {k: self.subset_groups[k] for k in group}
        for groupname, subsets in shown_groups.items():
            for subsetname, subset in subsets.items():
                n_selected = subset.size # same as np.sum(subset.selection)
                n_total = len(subset.selection)
                summary.add_row(dict(
                    group=groupname, name=subsetname,
                    size=n_selected,
                    fraction=n_selected/n_total if n_total else np.nan, # avoid division by zero
                    expression=subset.expression,
                    label=subset.label,
                    ))
        return summary


    # @property

[docs]
    def subsets(self):
        '''
        Get the subsets of this data, organized by group name and subset name.

        The returned ``SummaryDict`` wraps ``self.subset_groups`` and can be accessed
        like a nested dictionary.

        Example
        -------
            >>> subsets = data.subsets()
            >>> mysubset = subsets['group_name']['subset_name']
        '''
        groups = self.subset_groups
        title = f"subsets of Data '{self.name}'"
        return SummaryDict(
            groups,
            dict_name=title,
            element_names=['groups', 'subsets'],
            join_str=': ',
            )


    #### plot


[docs]
    def set_labels(self, **kwargs):
        '''
        set_labels(<column_name>=<label>)

        Add or update the labels of columns. The labels are used for, e.g.,
        the labels on the axes of the plots.

        Example: after ``data.set_labels(col1='$x_1$')``, the data in ``data.t['col1']``
        will be labeled as '$x_1$' on the plots.

        Parameters
        ----------
        **kwargs : <column_name:str>=<label:str>
        '''
        labels = self.col_labels
        labels.update(**kwargs)



[docs]
    def get_labels(self, *cols, listalways=False, eval=False):
        '''
        Get the labels of columns.

        A column without a label set by ``set_labels`` is labeled with its own name.

        Parameters
        ----------
        *cols : str
            Names of the columns.
        listalways : bool, optional
            If True, always returns a list of labels (even if it has only one item).
            The default is False.
        eval : bool, optional
            If True, names that are neither labeled nor column names of this data are
            considered as expressions (see ``Data.eval()``); in the returned label,
            each ``'$'`` is escaped as ``'\\$'``.
            The default is False.

        Returns
        -------
        str or list of str
            A single label if exactly one column is given and ``listalways`` is False;
            otherwise a list of labels.
        '''
        known = self.col_labels
        labels = []
        for col in cols:
            if col in known:
                label = known[col]
            elif eval and col not in self.colnames:
                # an expression rather than a column name
                label = col.replace('$', '\\$')
            else:
                label = col
            labels.append(label)

        if listalways or len(labels) != 1:
            return labels
        return labels[0]


    # @property
    # def labels(self):
    #     return self.get_labels()

    # TODO. argument col_input: plan: make it possible to input something like 'col1', 'col2', c='col3',
    # and translate it to make it the same as columns=('col1', 'col2'), kwarg_columns={'c': 'col3'}.

[docs]
    @keyword_alias('deprecated', columns='cols', kwarg_columns='kwcols')
    @keyword_alias('accepted', group='groups')
    def plot(self, func, *args, col_input=None, cols=None, kwcols={}, eval=False, eval_kwargs={}, paths=None, subsets=None, groups=None, autolabel=True, ax=None, verbose=True, global_selection=None, title=None, iter_kwargs={}, **kwargs):
        '''
        Make a plot given a plot function.

        Arguments ``paths``, ``subsets``, ``groups`` are used to specify the subsets of data
        that are plotted in the same subplot.

        Parameters
        ----------
        func : str or Callable
            Function to make plots, e.g. ``plt.plot``,
            or name of the function, e.g. ``'plot'``.
        *args :
            Arguments to be passed to func.
        col_input : optional
            Currently not used by this method.
        cols : str or list of str, optional
            The name of the columns to be passed to ``func``.
            For example, if ``cols = ['col1', 'col2']``, ``func`` will be called by::

                func(data['col1'], data['col2'], *args)

            `Note`: When ``autolabel`` is True, the len of this argument is used to guess the dimension of the plot (e.g. 2D/3D).
            The default is None.
        kwcols : dict, optional
            Names of data columns that are passed to ``func`` as keyword arguments.
            Each value is a column name or a list/tuple of column names.
            For example, if ``kwcols={'x': 'col1', 'y':'col2'}``, ``func`` will be called by::

                func(x=data['col1'], y=data['col2'])

        eval : bool, optional
            If set to ``True``, the names of data columns for ``cols`` and ``kwcols`` will be regarded as expressions to be evaluated with ``Data.eval()``.
            This means that you can not only input column names, but also input expressions. See :meth:`~Data.eval` for the syntax of expressions.
            Otherwise, the names will simply be considered as column names.
            The default is False.
        eval_kwargs : dict, optional
            Keyword arguments to be passed to ``Data.eval()`` when evaluating ``cols`` and ``kwcols``.
            Ignored if argument ``eval`` set to ``False``.
            The default is {}.
        paths : str or list of str, optional
            The full path of a subset (e.g. ``'<group_name>/<subset_name>'``) or a list of paths.
            If this is given, arguments ``subsets`` and ``groups`` are ignored.
            The default is None.
        subsets : str or list of str, optional
            The names of subsets, or a list of names.
            The default is all subsets in the specified group.
        groups : str, optional
            The name of the group. The default is the default group.
        autolabel : bool, optional
            If True, will try to automatically add labels to the plot (made by ``func``) as well as the axes,
            using the labels stored in Data and Subset objects.

            NOTE: The labels for axes are auto-set according to the argument ``cols``,
            and may not get the results you expect.

            The default is True.
        ax : axes, optional
            The axis to make the plot.
            The default is None, in which case the current axes (``plt.gca()``) is used.
        verbose : bool, optional
            Whether some detailed information is printed. The default is True.
            (Currently nothing is printed by this method.)
        global_selection : ``pyttop.table.Subset`` or str or list of str, optional
            The global selection [or the path(s) of the selection(s)] for this plot.
            If not None, only data selected by this argument is plotted.
            Accepted input:

            - A ``pyttop.table.Subset`` object. Note that logical operations of subsets are supported, e.g. ``subset1 & subset2 | subset3``.
            - The path to the subset, i.e. ``'groupname/subsetname'``. If group name is 'default', you can directly use 'subsetname'.
            - A list/tuple/set of paths to the subsets. The global selection will be the logical AND (i.e. the intersection set) of the subsets.

            The default is None.
        title : str
            Manually setting the title of the plot. This will overwrite the title automatically generated.
            The default is None (automatically generated if autolabel is True).
        iter_kwargs : dict, optional
            Lists of keyword arguments that are different for each subset specified.
            Suppose 3 subsets are specified using the ``subsets`` argument, an example value for
            ``iter_kwargs`` is ::

                {'color': ['b', 'r', 'k'], 'linestyle': ['-', '--', '-.']}

            A value that is not iterable is treated as a one-element list.
            The default is {}.
        **kwargs :
            Additional keyword arguments to be passed to ``func``.

        Returns
        -------
        The value returned by ``func`` for the last subset plotted
        (None if there is no subset to plot).

        Raises
        ------
        ValueError
            If ``func`` is an unrecognized name, or if the len of one item of
            ``iter_kwargs`` is not equal to the number of subsets plotted.
        TypeError
            If a value of ``kwcols`` is neither a str nor a list/tuple of str.
        '''
        # The dict defaults in the signature are shared between calls:
        # they are only read or copied here, never modified.
        per_subset_kwargs = iter_kwargs.copy()
        kwarg_columns = kwcols.copy()
        columns = [cols] if isinstance(cols, str) else cols

        if isinstance(func, str):
            if func not in plot_funcs:
                raise ValueError("unrecognized func name '{}' (supported names: '{}')".format(func, "', '".join(plot_funcs.keys())))
            func = plot_funcs[func]
        else:
            func = plot.plotFuncAuto(func)

        if ax is None:
            ax = plt.gca()

        if global_selection is not None:
            global_selection = bitwise_all(self._get_subsets(path=global_selection, listalways=True))

        # subsets as specified by the user (their labels are used for the legend) ...
        local_subsets = self._get_subsets(path=paths, name=subsets, group=groups, listalways=True)
        # ... and the subsets actually plotted (intersected with the global selection)
        if global_selection is not None:
            plotted_subsets = [(subset & global_selection) for subset in local_subsets]
        else:
            plotted_subsets = local_subsets

        subset_data_list = self._data_from_subset(plotted_subsets, minimal=True)
        n_subsets = len(subset_data_list)
        if n_subsets == 0:
            warnings.warn('No subset is selected: nothing is plotted.')

        # try to automatically set label
        if autolabel:
            if 'label' not in per_subset_kwargs and 'label' not in kwargs: # label for single plot element
                # check if func supports label as input
                if isinstance(func, plot.PlotFunction):
                    func_params = func.func_sig.parameters
                    func_name = func.func_defname
                else:
                    func_params = inspect.signature(func).parameters
                    func_name = func.__name__
                accepts_label = (
                    'label' in func_params.keys()
                    or any(param.kind == param.VAR_KEYWORD for param in func_params.values())
                    )
                if accepts_label:
                    per_subset_kwargs['label'] = [subset.label for subset in local_subsets]
                else:
                    warnings.warn(f'Failed to automatically set labels: user-defined function "{func_name}" does not support "label" as argument.')

            if 'label' in kwargs and n_subsets > 1:
                warnings.warn('You are setting the same label for plots of multiple subsets.')

            # set axis label
            label_kwargs = func.config['ax_label_kwargs_generator']
            if columns is not None:
                ax.set(**label_kwargs(
                    self.get_labels(*columns, listalways=True, eval=eval),
                    ))

            # special case for my scatter()
            if type(func) == plot.PlotFunction and type(func.func) == plot.Scatter and 'c' in kwarg_columns and 'barlabel' not in kwargs:
                kwargs['barlabel'] = self.get_labels(kwarg_columns['c'], eval=eval)

        if per_subset_kwargs:
            # Normalize and check the values. The NORMALIZED values are the ones used below,
            # so that a non-iterable value accepted by the check does not fail later in zip().
            normalized = {}
            for key, values in per_subset_kwargs.items():
                values = list(values) if isinstance(values, Iterable) else [values]
                if len(values) != n_subsets:
                    raise ValueError(f"len of iter_kwargs '{key}' should be {n_subsets}, got {len(values)}")
                normalized[key] = values
            # get kwargs for each subset
            iter_kwargs_list = [dict(zip(normalized, values)) for values in zip(*normalized.values())]
            # TODO: support specifying iter_kwargs using dict {subset_name: value}
        else:
            iter_kwargs_list = repeat({})

        plot_func = func.call_with_ax(ax) # callback of func should not be recursively called.

        # TODO: NOT IMPLEMENTED: if verbose and eval, print the expressions to be evaluated

        ret = None # stays None if there is no subset to plot
        for subset_data, this_iter_kwargs in zip(subset_data_list, iter_kwargs_list):
            if eval:
                # bind the current subset data explicitly
                get_col = lambda column, _data=subset_data: _data.eval(column, **eval_kwargs)
            else:
                get_col = subset_data.__getitem__

            # input data for the func (as *args)
            input_data = [] if columns is None else [get_col(column) for column in columns]

            this_kwarg_columns = {}
            for argname, argval in kwarg_columns.items():
                if isinstance(argval, str):
                    this_kwarg_columns[argname] = get_col(argval)
                elif isinstance(argval, (list, tuple)) and all(isinstance(v, str) for v in argval):
                    this_kwarg_columns[argname] = [get_col(v) for v in argval]
                else:
                    raise TypeError(f'expected str or list/tuple of str for values of kwcols, got "{type(argval)}"')

            ret = plot_func(*input_data, *args, **this_kwarg_columns, **this_iter_kwargs, **kwargs)

        if hasattr(func, 'ax_callback'):
            func.ax_callback(ax) # call ax_callback attached to func only once

        if autolabel:
            if n_subsets > 1:
                legend = ax.legend()
                if len(legend.get_texts()) == 0: # no legend generated?
                    legend.remove()

                if global_selection is not None:
                    ax.set_title(global_selection.label)
            elif n_subsets == 1: # only one data plot, just use title instead of legend
                ax.set_title(plotted_subsets[0].label)

        if title is not None:
            ax.set_title(title)

        return ret



[docs]
    @keyword_alias('deprecated', columns='cols', kwarg_columns='kwcols') # deprecated old names
    @keyword_alias('accepted', group='plotgroups', groups='plotgroups', paths='plotpaths', subsets='plotsubsets', ax='axes') # make plot() arguments acceptable here
    def plots(self, func, *args, cols=None, kwcols={}, eval=False, eval_kwargs={},
              plotpaths=None, plotsubsets=None, plotgroups=None,
              arraygroups=None, global_selection=None,
              share_ax=False, autobreak=False, autolabel=True, ax_callback=None,
              returns='fig', verbose=True,
              axes=None, fig=None,
              iter_kwargs={}, **kwargs):
        '''
        Make a plot given the function ``func`` used for plotting.

        If ``arraygroups`` is not ``None``, plot an "array" of subplots (panels; subplots with several rows and columns) for different selections given in ``arraygroups``;
        Each of the panels consists of several plots for different selections given in ``plotgroups``.
        This is useful if one wishes to compare a plot for different subsets of the data.
        For example, say ``plotgroups='group1'``, ``arraygroups=['group2', 'group3']``.
        Then each panel compares different subsets in ``'group1'``; different panels compare the results
        between subsets in ``'group2'`` and ``'group3'``.
        Note that the dataset for each plot in each panel is the INTERSECTION of the corresponding subsets in
        ``'group1'``, ``'group2'`` and ``'group3'``.

        Parameters
        ----------
        func : str or Callable or ``pyttop.plot.PlotFunction``
            Name of the ``matplotlib.pyplot`` function used to make plots, e.g. ``'plot'``, ``'scatter'``.

            Also accepts custom functions that receive an axis as the only argument,
            and return a function (called "plotting function" hereafter) to make plots.
            Example: ``lambda ax: ax.plot``.

            You can also input your custom plot function ``func`` defined by::

                from pyttop.plot import plotFunc
                @plotFunc
                def func(<your inputs>):
                    <make the plot>
                    return # you can return something here

            Or::

                from pyttop.plot import plotFuncAx
                @plotFuncAx
                def func(ax): # input ax axis
                    def plot(<your inputs>):
                        <make the plot>
                        return # you can return something here
                    return plot

        *args :
            Arguments to be passed to the plotting function.
        cols : str or list of str, optional
            The name of the columns to be passed to the plotting function.
            For example, if ``cols = ['col1', 'col2']``, the plotting function will be called by::

                func(data['col1'], data['col2'], *args)

            *Note*: When ``autolabel`` is True, the len of this argument is used to guess the dimension of the plot (e.g. 2D/3D).
            The default is None.
        kwcols : dict, optional
            Names of data columns that are passed to the plotting function as keyword arguments.
            For example, if ``kwcols={'x': 'col1', 'y': 'col2'}``, the plotting function will be called by::

                func(x=data['col1'], y=data['col2'])

        eval : bool, optional
            If set to ``True``, the names of data columns for ``cols`` and ``kwcols`` will be regarded as expressions to be evaluated with ``Data.eval()``.
            This means that you can not only input column names, but also input expressions. See :meth:`Data.eval` for the syntax of expressions.
            Otherwise, the names will simply be considered as column names.
            The default is False.
        eval_kwargs : dict, optional
            Keyword arguments to be passed to ``Data.eval()`` when evaluating ``cols`` and ``kwcols``.
            Ignored if argument ``eval`` set to ``False``.
            The default is {}.
        paths, subsets, groups :
            aliases of "plotpaths", "plotsubsets" and "plotgroups".
        plotpaths : str or list of str, optional
            The full path of a subset (e.g. ``'<group_name>/<subset_name>'``) or a list of paths, for plots in each subplot.
            If this is given, arguments ``plotsubsets`` and ``plotgroups`` are ignored.
            The default is None.
        plotsubsets : str or list of str, optional
            The names of subsets, or a list of names, for plots in each subplot.
            The default is all subsets in the specified group.
        plotgroups : str, optional
            The name of the subset group used to make different plots in each one of the panels.
            For example, when the plotting function plots curves and ``plotgroups`` consists of
            3 subsets, 3 curves for the 3 subsets are plotted in each of the panels.
            The default is None.
        arraygroups : str or iterable of len <= 2, optional
            The name of subset groups used to make different panels.
            Examples:

            - ``arraygroups = ['group1']``, where `'group1'` consists of 3 subsets.
              Then subplots with ``nrow=1, ncol=3`` (1x3) are generated.
            - ``arraygroups = ['group1', 'group2']``, where `'group1', 'group2'` consists of 3, 4 subsets respectively.
              Then subplots with ``nrow=3, ncol=4`` (3x4) are generated.

            The default is None.
        global_selection : ``pyttop.table.Subset`` or str or list of str, optional
            Only consider data in subset ``global_selection``.
            Accepted input:

            - A ``pyttop.table.Subset`` object. Note that logical operations of subsets are supported, e.g. ``subset1 & subset2 | subset3``.
            - The path to the subset, i.e. ``'groupname/subsetname'``. If group name is 'default', you can directly use 'subsetname'.
            - A list/tuple/set of paths to the subsets. The global selection will be the logical AND (i.e. the intersection set) of the subsets.

            The default is None (the whole dataset is considered).
        share_ax : bool, optional
            Whether the x, y axes are shared. The default is False.
            (Not implemented: setting this to True raises ``NotImplementedError``.)
        autobreak : bool, optional
            When ``arraygroups`` consists of only one group, whether to automatically break the row
            into several rows (since the default result is a group of subplots with only one row).
            The default is False.
        autolabel : bool, optional
            If True, will try to automatically add labels to the plot (made by ``func``) as well as the axes,
            using the labels stored in Data and Subset objects.

            NOTE: The labels for axes are auto-set according to the argument ``cols``,
            and may not give the results you expect.
            Labels for axes and legends are only possible if argument ``ax`` is given.

            The default is True.
        ax_callback : function, optional
            The function to be called as ``ax_callback(ax)`` after plotting in each panel,
            where ``ax`` is the axis object of this panel.
        returns : str, optional
            Decide what to return.

            - ``'fig'`` or ``'fig, axes'``:
                return figure and axes.
            - ``'plot'`` or ``'return'``:
                return a list of the returned values of the plot function.

            Whatever this argument is, you can always retrieve the figure, axes and the returned values (of the
            plot function) of the last call of ``data.plots()`` with ``data.plot_fig, data.plot_axes, data.plot_returns``.

        verbose : bool, optional
            Whether some detailed information is printed. The default is True.
        ax :
            alias of "axes".
        axes : list of axes, optional
            The axes of the subplots.
            The default is None.
        fig : ``matplotlib.figure.Figure``, optional
            The figure on which the subplots are made.
            The default is None.
        iter_kwargs : dict, optional
            Lists of keyword arguments that are different for each subset in ``plotgroups``.
            Suppose ``plotgroups='group1'`` consists of 3 subsets, an example value for
            ``iter_kwargs`` is ::

                {'color': ['b', 'r', 'k'], 'linestyle': ['-', '--', '-.']}

            The default is {}.
        **kwargs :
            Additional keyword arguments to be passed to the plotting function.

        Returns
        -------
        fig : ``matplotlib.figure.Figure``

        axes : ``matplotlib.axes.Axes`` or array of ``matplotlib.axes.Axes``

        If ``returns`` is ``'plot'`` or ``'return'``, the list ``data.plot_returns`` is returned instead.

        Raises
        ------
        NotImplementedError
            ``share_ax`` is True.
        ValueError
            ``func`` is a str that is not a supported function name;
            ``len(arraygroups) >= 3`` (plot array of dim >= 3 not supported);
            the number of axes is not the expected ``nrow*ncol``;
            or ``returns`` is not recognized.
        '''
        # TODO (not implemented)
        if share_ax: raise NotImplementedError('This feature is not implemented, and whether it will be added is undetermined.')

        self.plot_returns = []

        # work on copies so that the (shared, mutable) default dicts and the caller's dicts are never modified
        iter_kwargs = iter_kwargs.copy()
        kwarg_columns = kwcols.copy()
        columns = cols

        if isinstance(func, str):
            if func not in plot_array_funcs:
                raise ValueError(f'Supported func names are: {",".join(plot_array_funcs.keys())}')
            func = plot_array_funcs[func]
        else:
            func = plot.plotFuncAuto(func)

        # if type(global_selection) in (str, tuple, list, set):
        if global_selection is not None:
            global_selection = bitwise_all(self._get_subsets(path=global_selection, listalways=True))

        # special case for my scatter()
        if type(func) == plot.PlotFunction and type(func.func) == plot.Scatter and 'c' in kwarg_columns and 'barlabel' not in kwargs:
            kwargs['barlabel'] = self.get_labels(kwarg_columns['c'], eval=eval)

        if arraygroups is None:
            # only one axis
            if axes is None:
                axes = plt.gca()
            elif isinstance(axes, Iterable):
                # only one axis is needed: use the first one
                # (flatten first, so that a 2-D array of axes yields an axis rather than a row)
                axes = np.array(axes).ravel()[0]
            if fig is None:
                fig = axes.figure
            ret = self.plot(
                func, *args, cols=columns, kwcols=kwarg_columns,
                eval=eval, eval_kwargs=eval_kwargs,
                paths=plotpaths, subsets=plotsubsets, groups=plotgroups,
                autolabel=autolabel, global_selection=global_selection,
                verbose=verbose, ax=axes, iter_kwargs=iter_kwargs, **kwargs)
            if ax_callback is not None:
                ax_callback(axes)
            self.plot_returns.append(ret)

        else:
            # get subsets for each panel
            if type(arraygroups) is str:
                arraygroups = [arraygroups]
            subsets = [self._get_subsets(group=group, listalways=True) for group in arraygroups]
            # if global_selection is not None:
            #     subsets = [[subset & global_selection for subset in subseti] for subseti in subsets]
            if len(subsets) >= 3:
                raise ValueError('len(arraygroups) >=3: plot array of dim >= 3 not supported. ')
            elif len(subsets) == 2:
                subset_array = [[(xi & yi) for xi in subsets[0]] for yi in subsets[1]]
            else: # len(subsets) == 1
                subset_array = subsets

            # prepare and check consistency with axes
            nrow, ncol = len(subset_array), len(subset_array[0])
            if axes is None:
                if (autobreak
                    and (fig is None or not fig.axes)
                    and len(subsets) == 1):
                    # autobreak is not a comprehensive function
                    # decide nrow and ncol if autobreak
                    if len(subsets[0]) in subplot_arrange:
                        nrow, ncol = subplot_arrange[len(subsets[0])]
                    else:
                        pass # nrow, ncol = 1, len(subsets[0]) # TODO: not implemented

                if fig is None:
                    figsize = [6.4*(1+.7*(ncol-1)), 4.8*(1+.7*(nrow-1))]
                    fig = plt.figure(figsize=figsize)
                if fig.axes:
                    axes = fig.axes
                else: # fig is empty: create axes
                    axes = fig.subplots(nrow, ncol)

            elif not isinstance(axes, Iterable):
                axes = [axes]
            axes = np.array(axes)
            axes_flat = axes.ravel()
            if len(axes_flat) != nrow*ncol:
                # report the flattened count (len(axes) would be the number of rows for a 2-D array)
                raise ValueError(f'Expected {nrow}*{ncol}={nrow*ncol} axes; got {len(axes_flat)}.')
            if fig is None:
                # axes were given by the user (list, tuple or array): infer the figure from the first axis
                fig = axes_flat[0].figure

            # plot subplots
            for ax, subset in zip(axes_flat, chain(*subset_array)):
                if global_selection is not None:
                    subset_with_global = subset & global_selection
                else:
                    subset_with_global = subset
                ret = self.plot(
                    func, *args, cols=columns, kwcols=kwarg_columns,
                    eval=eval, eval_kwargs=eval_kwargs,
                    paths=plotpaths, subsets=plotsubsets, groups=plotgroups,
                    autolabel=autolabel, verbose=verbose, ax=ax,
                    global_selection=subset_with_global, title=subset.label,
                    iter_kwargs=iter_kwargs, **kwargs)
                self.plot_returns.append(ret)

                if ax_callback is not None:
                    ax_callback(ax)
            if autolabel and global_selection is not None:
                fig.suptitle(global_selection.label)

        self.plot_fig = fig
        self.plot_axes = axes

        if returns in ['fig', 'fig, axes']:
            return fig, axes
        elif returns in ['plot', 'return']:
            return self.plot_returns
        else:
            raise ValueError(f'Unrecognized input for returns: "{returns}"')


    #### IO

    # How each attribute is stored when saving and loading "data" (zip) files:
    # attribute name -> save method
    data_to_save = dict(
        col_labels='json',
        subset_groups='pkl',
        t='astropy.table',
        name='txt',
        matchlog='json',
        meta='json',
    )
    table_format = 'fits' # 'ascii.ecsv' # 'fits' # 'asdf' # 'ascii.ecsv'
    table_ext = '.fits' # '.csv' # '.fits' # '.asdf' # '.ecsv'
    # all information for saving; written to '.save_meta.json' by save()
    save_meta = {
        'data_to_save': data_to_save,
        'table_format': table_format,
        'table_ext': table_ext,
        'package_version': __version__,
    }

    # old values, used by load() for files that contain no saved save_meta
    # attribute name -> save method
    _old_data_to_save = dict(
        col_labels='pkl',
        subset_groups='pkl',
        t='astropy.table',
        name='pkl',
        matchlog='pkl',
    )
    _old_table_format = 'fits'
    _old_table_ext = '.fits'


[docs]
    def save(self, path, format='data', overwrite=False):
        '''
        Save data to file.

        Parameters
        ----------
        path : str or path-like
            Path to the file.
            For ``format='data'``, the extension ``'.data'`` is appended if missing.
        format : str, optional
            The format of the file.
            The default is 'data'.
            Supported formats include:

            - 'pkl':
                Saving the full data object to a ``"*.pkl"`` file.
            - 'data' (default):
                Saving key data (including the data table, the subsets, etc.) to a ``"*.data"`` file.
                Note that the matching data is not saved.
            - Other formats: Any format supported by ``astropy.table.Table.write``.
                Only saving the data table (``astropy.table.Table``).
                This is equivalent to ``data.t.write(<...>)``.
        overwrite : bool, optional
            Whether to overwrite the file if it exists.
            If set to ``False``, a ``FileExistsError`` will be raised.
            The default is False.

        Raises
        ------
        FileExistsError
            The file already exists.
        TypeError
            (``format='data'``) An attribute does not have the type required by its save method.
        ValueError
            (``format='data'``) Unrecognized save method in ``Data.data_to_save``.

        Notes
        -----
        **Notes for developers**

        When setting ``format='pkl'``, a Data object will be saved with the standard ``pickle`` module.
        This means that all data for the object is converted and saved as a byte stream. When setting ``format='data'``,
        only a selected subset of attributes will be saved `separately`, and are not necessarily saved
        with the Python's standard pickling protocols. This makes it possible to retrieve some data from the ``'*.data'`` file
        even without e.g. Python's ``pickle`` module.
        '''
        if format == 'pkl':
            save_pickle(path, self, yes=overwrite)
            return

        if format != 'data':
            self.t.write(path, format=format, overwrite=overwrite)
            return

        # format == 'data': save important data in a zip file
        path = os.fspath(path) # also accept pathlib.Path and other path-like objects
        if not path.endswith('.data'):
            path += '.data'
        if not overwrite and os.path.exists(path):
            raise FileExistsError(f'File "{path}" already exists. To overwrite, use the argument "overwrite=True".')

        with zipfile.ZipFile(path, mode='w', compression=zipfile.ZIP_DEFLATED) as datazip:

            def write_member(fname, content):
                # write bytes ``content`` as member ``fname`` of the zip file
                with datazip.open(fname, mode='w') as f:
                    f.write(content)

            # save data_to_save
            for attr, method in Data.data_to_save.items():
                value = getattr(self, attr)
                if method == 'astropy.table':
                    fname = attr + Data.table_ext
                    # explicit check rather than ``assert``, which is stripped under ``python -O``
                    if type(value) is not Table:
                        raise TypeError(f"'astropy.table' mode only for astropy.table.Table, got {type(value)}")
                    if Data.table_format.startswith('ascii.'): # uses astropy.io.ascii
                        # ascii writers produce text: get the string and encode it
                        with io.StringIO() as sf:
                            value.write(sf, format=Data.table_format)
                            table_str = sf.getvalue()
                        write_member(fname, table_str.encode())
                    else:
                        with datazip.open(fname, mode='w') as f:
                            value.write(f, format=Data.table_format) # ascii.ecsv
                elif method == 'pkl':
                    with datazip.open(attr + '.pkl', mode='w') as f:
                        pickle.dump(value, f)
                elif method == 'json':
                    write_member(attr + '.json', json.dumps(value, indent=4).encode('ascii'))
                elif method == 'txt':
                    if not isinstance(value, str):
                        raise TypeError("'txt' mode only for str")
                    write_member(attr + '.txt', value.encode())
                else:
                    raise ValueError(f'unrecognized saving method: {method}')

            # save save_meta
            save_meta = self.__class__.save_meta
            write_member('.save_meta.json', json.dumps(save_meta, indent=4).encode('ascii'))

            # save subset_data_consist
            consist_dict = self._check_subsets_consistency()
            write_member('.subset_data_consist.json', json.dumps(consist_dict, indent=4).encode('ascii'))



[docs]
    @classmethod
    def load(cls, path, format='data', **kwargs):
        '''
        Load a data file saved with ``Data.save()`` (usually with ".data" or ".pkl" format).

        *Note*: You may also read a raw table file like ``'*.csv'``, but it
        is suggested to use ``Data('your_catalog.csv')`` instead of
        ``Data.load('your_catalog.csv', format='ascii.csv')``.

        *Warning*: ``'.data'`` files may contain pickled members, which are read with
        ``pickle.load``. Only load files from sources you trust.

        Parameters
        ----------
        path : str
            Path to the file.
        format : str, optional
            The format of the file (see :meth:`Data.save`).
            The default is 'data'.
        **kwargs :
            other arguments passed when initializing ``Data``
            [Only used when format is neither 'data' nor 'pkl'.]

        Returns
        -------
        data : ``pyttop.table.Data``

        Raises
        ------
        ValueError
            (``format='data'``) The file is not a zip file, or an unrecognized save method is recorded in it.
        FailedToLoadError
            (``format='data'``) A ``KeyError`` occurred while reading the file (e.g. an expected member is missing).
        SubsetError
            (``format='data'``) The file records a subset as not consistent with the data.
        '''
        if format == 'pkl':
            return load_pickle(path)
        if format != 'data':
            return cls(path, format=format, **kwargs)

        attrs = {}
        save_meta = None # defined before the ``try`` so that the KeyError handler can always refer to it
        try:
            with zipfile.ZipFile(path) as datazip:
                members = set(datazip.namelist())

                # read the information on how the file was saved
                for meta_name in ('.save_meta.json', '.meta.json'): # the latter is the old name
                    if meta_name in members:
                        with datazip.open(meta_name) as f:
                            save_meta = json.load(f)
                        data_to_save, table_ext, table_format = save_meta['data_to_save'], save_meta['table_ext'], save_meta['table_format']
                        break
                else: # saved before save_meta was introduced: use the old values
                    data_to_save, table_ext, table_format = Data._old_data_to_save, Data._old_table_ext, Data._old_table_format

                if '.subset_data_consist.json' in members:
                    with datazip.open('.subset_data_consist.json') as f:
                        subset_data_consist = json.load(f)
                else:
                    subset_data_consist = None

                for attr, method in data_to_save.items():
                    if method == 'astropy.table':
                        fname = attr + table_ext
                        with datazip.open(fname) as f:
                            attrs[attr] = Table.read(f, format=table_format, # ascii.ecsv
                                                     # unit_parse_strict='silent',
                                                     )
                    elif method == 'pkl':
                        fname = attr + '.pkl'
                        with datazip.open(fname) as f:
                            # NOTE(security): unpickling can execute arbitrary code;
                            # only load '.data' files from trusted sources.
                            attrs[attr] = pickle.load(f)
                    elif method == 'json':
                        fname = attr + '.json'
                        with datazip.open(fname) as f:
                            attrs[attr] = json.load(f)
                    elif method == 'txt':
                        fname = attr + '.txt'
                        with datazip.open(fname) as f:
                            attrs[attr] = f.read().decode()
                    else:
                        raise ValueError(f'unrecognized saving method: {method}')
        except zipfile.BadZipFile as e:
            raise ValueError(f'The file is not a ".data" file generated by PyTTOP. Did you mean "Data(\'{path}\', <...>)"?') from e
        except KeyError as e:
            ver = f" ({save_meta['package_version']})" if save_meta and 'package_version' in save_meta else ''
            raise FailedToLoadError(f"Failed to load '{path}': is not a '.data' file or is saved with an older version{ver} of pyttop.") from e

        data = cls(attrs['t'], name=attrs.get('name'))

        # initialize subsets: associate each loaded subset with the new Data object
        if 'subset_groups' in attrs:
            subset_groups = attrs['subset_groups']
            for groupname, group in subset_groups.items():
                for subsetname, subset in group.items():
                    if subset_data_consist is not None: # for older versions, no consistency information is saved
                        consist = subset_data_consist[groupname][subsetname]
                        if consist is not True: # False, None # for newer versions, this will not happen
                            raise SubsetError('unexpected inconsistency between subsets and data')
                    subset._data = data

        update_names = [i for i in attrs if i not in ['name', 't']] # attrs that need to be updated
        for name in update_names:
            if name == 'meta': # a special case: can't set attribute 'meta'
                # notes: the metadata is saved separately in the data file, since some formats (e.g. fits) may not support certain features of the metadata.
                getattr(data, name).clear()
                getattr(data, name).update(attrs[name])
            else: # the normal cases
                setattr(data, name, attrs[name])
        return data


    #### basic methods

    def copy(self):
        '''
        Not implemented: always raises ``NotImplementedError``.
        '''
        raise NotImplementedError

    @property
    def _short_name(self):
        if self.name is None:
            return None
        return omit_middle(self.name, config.display.data_name_maxlen)

    ## below are magic methods

    def __repr__(self):
        short_name = self._short_name
        namestr = f"'{short_name}'" if short_name is not None else 'without name'
        return f"<Data {namestr}>"

    def __len__(self):
        return len(self.t)

    def __getitem__(self, item):
        '''
        Get ``self.t[item]``.

        Parameters
        ----------
        item :
            Anything accepted for subscripting the table ``self.t``.
            In addition:

            - a ``Subset`` is first converted with ``np.array(item)``;
            - a boolean masked array with masked elements gets them filled with ``False``
              (a warning is issued).

        Raises
        ------
        ColumnNotFoundError
            If subscripting ``self.t`` raises a ``KeyError``.
            The message suggests close matches among ``self.colnames`` when the missing key is a str.
        '''
        # warnings.warn('Although supported, it is not suggested to access table by directly subscripting Data objects. Use e.g. data.t[index] instead of data[index].')
        if isinstance(item, Subset):
            item = np.array(item) # see also: Data.subset_data()
        if np.ma.is_masked(item) and item.dtype == np.bool_:
            warnings.warn('got masked boolean array for item access: masked elements filled with False',
                          stacklevel=2)
            # Masked boolean array detected. Masked values will be filled with False and not retrieved in the resulting slice.
            item = item.filled(False) # This makes it similar to Subsets: "Masked elements do NOT belong to this subset"

        try:
            return self.t[item]
        except KeyError as e:
            # fall back to ``item`` if the KeyError carries no argument
            key = e.args[0] if e.args else item
            # get_close_matches only works for str; a non-str key must not hide the original error
            suggest_names = get_close_matches(key, self.colnames) if isinstance(key, str) else []
            msg = f"'{key}'"
            if suggest_names:
                msg += " (did you mean: '{}')".format("', '".join(suggest_names))
            raise ColumnNotFoundError(msg) from e

        # return Data(self.t[item])

    def __setitem__(self, item, value):
        # only changes metadata when adding one new column;
        # see data.t's __setitem__ (astropy.table.table.Table.__setitem__)
        # TODO: better handle metadata (but maybe no need to change metadata for
        # changing existing columns; adding several new column seems to be unsupported)

        # warnings.warn('Although supported, it is not suggested to set items of the table by directly subscripting Data objects. Use e.g. data.t[index] instead of data[index].')
        if not isinstance(item, str):
            raise NotImplementedError('Currently, we only accept a str as the index. You may consider directly setting the values by `data.t[...] = ...` instead of `data[...] = ...` (WITH CAUTION).')

        new = isinstance(item, str) and item not in self.colnames # setting new column
        if new:
            was_empty = len(self) == 0 # This was an empty table
            self.t[item] = value
            self.t[item].meta['src'] = 'user-added'
            self.t[item].meta['src_detail'] = 'set by user'
            self.t[item].meta['set_by_user'] = True
            self.t[item].description = ''
            self.t[item].unit = ''
            if was_empty:
                self.clear_subsets() # reset subsets (so that subset 'all' is re-defined)
        else: # modifying existing column?
            # meta not modified; description and unit cleared
            old_meta = self.t[item].meta
            self.t[item] = value
            self.t[item].meta = old_meta
            self.t[item].description = ''
            self.t[item].unit = ''
            self.t[item].meta['src_detail'] += '; modified by user'
            self.t[item].meta['set_by_user'] = True

    def __contains__(self, item):
        '''
        Checks if this ``Data`` contains an ``item``.

        Parameters
        ----------
        item : ``Subset``
            The object to look for. Currently only ``Subset`` is supported:

            ``Subset``:
                Returns ``True`` if ``item`` is (by identity) one of the subsets recorded
                in ``self.subset_groups``.
                (See also: ``Data._check_subset_association``)

        Raises
        ------
        TypeError
            ``item`` is not a ``Subset``.

        See also
        --------
        ``Data._check_subset_association``
        '''
        if not isinstance(item, Subset):
            raise TypeError(f"unsupported type for 'in': {type(item)}")
        return any(
            item is member
            for group in self.subset_groups.values()
            for member in group.values()
        )

    def _ipython_key_completions_(self):
        return self.colnames

    #### alternative names and abbreviations

    def df(self, index=None, use_nullable_int=True):
        '''
        Convenient method to get the pandas DataFrame from data.

        Returns ``self.t.to_pandas(...)``, with ``index`` and ``use_nullable_int`` passed on as keyword arguments.
        '''
        options = dict(index=index, use_nullable_int=use_nullable_int)
        return self.t.to_pandas(**options)

    def ssdf(self, group=None, index=None, use_nullable_int=True):
        '''
        Like ``df()``, but for the result of ``self.subset_summary(group=group)``.

        ``index`` and ``use_nullable_int`` are passed on to its ``to_pandas`` method as keyword arguments.
        '''
        summary = self.subset_summary(group=group)
        options = dict(index=index, use_nullable_int=use_nullable_int)
        return summary.to_pandas(**options)

    @property
    def labels(self):
        '''
        Alternative name for ``col_labels``.
        '''
        col_labels = self.col_labels
        return col_labels

    ## deprecated old names
    # subsets = subset_data # another name for subset_data
    # subplot_array = plots

[docs]
    # Deprecated old name of ``plots``: forwards all arguments to ``self.plots``.
    # ``wraps(plots)`` copies the metadata of ``plots`` (e.g. its signature information) to this wrapper.
    @wraps(plots)
    def subplot_array(self, *args, **kwargs):
        return self.plots(*args, **kwargs)

    # Must come after the definition: overrides the docstring copied from ``plots`` by ``wraps``.
    subplot_array.__doc__ = 'Deprecated name of :meth:`~Data.plots`'

    ## abbreviations for properties
    cols = colnames

    ## abbreviations for methods
    # (old way of defining the abbreviations, kept for reference)
    # tree = match_tree
    # mm = mskmis = mask_missing
    # chkdup = checkdup = check_duplication
    # adsub = add_subsets
    # gs = gtsub = get_subsets
    # subdat = subset_data
    # ss = subsum = subset_summary

    # method name -> list of its abbreviations
    _method_aliases = dict(
        match_tree=['tree'],
        mask_missing=['mskmis', 'mm'],
        check_duplication=['checkdup', 'chkdup'],
        add_subsets=['adsub'],
        get_subsets=['gtsub', 'gs'],
        subset_data=['subdat'],
        subset_summary=['subsum', 'ss'],
    )


# create_method_alias(Data, Data._method_aliases)