''' Statistical methods used to define or modify the position of glyphs.

References:
    Wilkinson L. The Grammar of Graphics, sections 7, 7.1

Method Types:
    - Bin: Partitions a space before statistical calculation
    - Summary: Produces a single value comprising a statistical summary
    - Region: Produces two values bounding an interval.
    - Smooth: Produces values representing smoothed versions of the input data.
    - Link: Produces edges from pairs of nodes in a graph.

'''
from __future__ import absolute_import

import numpy as np
import pandas as pd
from six import string_types

from bokeh.models.sources import ColumnDataSource
from bokeh.core.has_props import HasProps
from bokeh.core.properties import (Bool, Date, Datetime, Either, Float,
                                   Instance, Int, List, String)

from .properties import Column, ColumnLabel, EitherColumn


class Stat(HasProps):
    """Represents a statistical operation to summarize a column of data.

    Can be computed from either a ColumnLabel with a ColumnDataSource, *or*, a
    discrete column of data.
    """

    # inputs
    column = ColumnLabel(help="""A column to use for the stat calculation. Required
        when providing a ColumnDataSource as input.""")
    source = Instance(ColumnDataSource, help="""One option for providing the data
        source for stat calculation.""")
    values = EitherColumn(Column(Float), Column(Int), Column(String),
                          Column(Date), Column(Datetime), Column(Bool), default=None, help="""
        Second option for providing values for stat calculation is by passing
        the actual column of data.""")

    # output
    value = Float(help="""The value calculated for the stat. Some stats could use
        multiple properties to provide the calculation if required.""")

    def __init__(self, **properties):

        source = properties.pop('source', None)
        if source is not None:
            if isinstance(source, pd.DataFrame):
                source = ColumnDataSource(source)
            properties['source'] = source

        super(Stat, self).__init__(**properties)
        self._refresh()

    def _refresh(self):
        """Lazy update of properties, used for initial transform init."""
        if self.get_data() is not None:
            self.update()
            self.calculate()

    def set_data(self, data, column=None):
        """Set data properties and update all dependent properties."""
        if isinstance(data, pd.DataFrame):
            data = ColumnDataSource(data)

        if isinstance(data, ColumnDataSource):
            self.source = data
            if column is not None:
                self.column = column
        else:
            self.values = data

        self.update()
        self.calculate()

    def get_data(self, column=None):
        """Returns the available columnlabel/source values or column values."""
        if self.source is not None and (self.column is not None or column is not None):
            # prefer an explicitly passed column over the configured one
            if column is not None:
                col = column
            else:
                col = self.column

            return pd.Series(self.source.data[col])
        elif self.values is None and self.source is not None:
            # a source without any column: fall back to its index
            return pd.Series(self.source.to_df().index)
        elif self.values is not None:
            return self.values
        else:
            return None

    def calculate(self):
        """Return transformed value from column label/source or column-like data."""
        raise NotImplementedError('You must implement the calculate method '
                                  'for each stat type.')

    def update(self):
        """Perform any initial work before the actual calculation is performed."""
        pass


class Sum(Stat):
    def calculate(self):
        self.value = self.get_data().sum()


class Mean(Stat):
    def calculate(self):
        self.value = self.get_data().mean()


class Count(Stat):
    def calculate(self):
        self.value = self.get_data().count()


class CountDistinct(Stat):
    def calculate(self):
        self.value = self.get_data().nunique()


class Median(Stat):
    def calculate(self):
        self.value = self.get_data().median()


class StdDeviation(Stat):
    def calculate(self):
        self.value = self.get_data().std()
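
# Illustrative usage (a sketch, not executed at import time): the summary stats
# above accept either a ColumnDataSource plus a column name, or a raw column of
# values. The DataFrame ``df`` and column name 'mpg' below are hypothetical
# placeholders, not part of this module.
#
#     df = pd.DataFrame({'mpg': [18.0, 21.0, 22.8, 33.9]})
#     mean = Mean(source=ColumnDataSource(df), column='mpg')
#     total = Sum(values=df['mpg'])
#     mean.value, total.value   # computed eagerly by Stat._refresh()
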
class Min(Stat):
    def calculate(self):
        self.value = self.get_data().min()


class Max(Stat):
    def calculate(self):
        self.value = self.get_data().max()


class Quantile(Stat):
    """Produces the cutpoint that divides the input data by the interval.

    Quartiles are a special case of quantiles that divide a dataset into four
    equal-size groups. (https://en.wikipedia.org/wiki/Quantile)
    """
    interval = Float(default=0.5)

    def calculate(self):
        self.value = self.get_data().quantile(self.interval)


class Bin(Stat):
    """Represents a single bin of data values and attributes of the bin."""
    label = Either(String, List(String))
    start = Either(Float, List(Float))
    stop = Either(Float, List(Float))

    start_label = String()
    stop_label = String()

    center = Either(Float, List(Float))

    stat = Instance(Stat, default=Count())
    width = Float()

    def __init__(self, bin_label, values=None, source=None, **properties):
        if isinstance(bin_label, tuple):
            bin_label = list(bin_label)
        elif isinstance(bin_label, string_types):
            bin_label = [bin_label]
        else:
            # This handles Pandas new Interval objects
            bin_label = [str(bin_label)]
        properties['label'] = bin_label

        bounds = self.process_bounds(bin_label)

        starts, stops = zip(*bounds)
        centers = [(start + stop)/2.0 for start, stop in zip(starts, stops)]
        if len(starts) == 1:
            starts = starts[0]
            stops = stops[0]
            centers = centers[0]
        else:
            starts = list(starts)
            stops = list(stops)
            centers = list(centers)

        properties['start'] = starts
        properties['stop'] = stops
        properties['center'] = centers
        properties['values'] = values
        super(Bin, self).__init__(**properties)

    @staticmethod
    def binstr_to_list(bins):
        """Parse a bin label string, such as '(1.5, 2.5]', into (start, stop) floats."""
        value_chunks = bins.split(',')
        value_chunks = [val.replace('[', '').replace(']', '').replace('(', '').replace(')', '')
                        for val in value_chunks]
        bin_values = [float(value) for value in value_chunks]

        return bin_values[0], bin_values[1]

    def process_bounds(self, bin_label):
        if isinstance(bin_label, list):
            return [self.binstr_to_list(dim) for dim in bin_label]
        else:
            return [self.binstr_to_list(bin_label)]

    def update(self):
        self.stat.set_data(self.values)

    def calculate(self):
        self.value = self.stat.value


class BinStats(Stat):
    """A set of statistical calculations for binning values.

    Bin counts using: https://en.wikipedia.org/wiki/Freedman%E2%80%93Diaconis_rule
    """
    bins = Either(Int, Float, List(Float), default=None, help="""
        If bins is an int, it defines the number of equal-width bins in the
        given range. If bins is a sequence, it defines the bin edges, including
        the rightmost edge, allowing for non-uniform bin widths.

        (default: None, use the Freedman-Diaconis rule)
        """)
    bin_width = Float(default=None, help='Use the Freedman-Diaconis rule if None.')
    q1 = Quantile(interval=0.25)
    q3 = Quantile(interval=0.75)
    labels = List(String)

    def __init__(self, values=None, column=None, **properties):
        properties['values'] = values
        properties['column'] = column or 'values'
        super(BinStats, self).__init__(**properties)

    def update(self):
        values = self.get_data()
        self.q1.set_data(values)
        self.q3.set_data(values)
        if self.bins is None:
            self.calc_num_bins(values)

    def calc_num_bins(self, values):
        """Calculate the optimal number of bins using the IQR (Freedman-Diaconis).

        From: http://stats.stackexchange.com/questions/114490/optimal-bin-width-for-two-dimensional-histogram
        """
        iqr = self.q3.value - self.q1.value

        if iqr == 0:
            self.bin_width = np.sqrt(values.size)
        else:
            self.bin_width = 2 * iqr * (len(values) ** -(1. / 3.))

        self.bins = int(np.ceil((values.max() - values.min()) / self.bin_width))

        if self.bins <= 1:
            self.bins = 3

    def calculate(self):
        pass
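
# Illustrative usage (a sketch, not executed at import time): Quantile and
# BinStats operating on a plain column of values. The sample ``values`` series
# is a hypothetical placeholder.
#
#     values = pd.Series([1, 2, 2, 3, 5, 8, 13, 21])
#     q1 = Quantile(interval=0.25, values=values)
#     q1.value                              # 25th percentile of ``values``
#     bin_stats = BinStats(values=values)   # update() runs via Stat._refresh()
#     bin_stats.bins, bin_stats.bin_width   # derived via the Freedman-Diaconis rule
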
class BinnedStat(Stat):
    """Base class for shared functionality across bins and aggregates
    dimensions for plotting.
    """
    bin_stat = Instance(BinStats, help="""
        A mapping between each dimension and associated binning calculations.
        """)

    bins = List(Instance(Bin), help="""
        A list of the `Bin` instances that were produced as a result of the inputs.
        Iterating over `Bins` will iterate over this list. Each `Bin` can be inspected
        for metadata about the bin and the values associated with it.
        """)

    stat = Instance(Stat, default=Count(), help="""
        The statistical operation to be used on the values in each bin.
        """)

    bin_column = String()
    centers_column = String()

    aggregate = Bool(default=True)

    bin_values = Bool(default=False)

    bin_width = Float()

    def __init__(self, values=None, column=None, bins=None,
                 stat='count', source=None, **properties):

        if isinstance(stat, string_types):
            stat = stats[stat]()

        properties['column'] = column or 'vals'
        properties['stat'] = stat
        properties['values'] = values
        properties['source'] = source
        self._bins = bins
        super(BinnedStat, self).__init__(**properties)

    def _get_stat(self):
        stat_kwargs = {}

        if self.source is not None:
            stat_kwargs['source'] = self.source
            stat_kwargs['column'] = self.column

        elif self.values is not None:
            stat_kwargs['values'] = self.values

        stat_kwargs['bins'] = self._bins

        return BinStats(**stat_kwargs)

    def update(self):
        self.bin_stat = self._get_stat()
        self.bin_stat.update()


class Bins(BinnedStat):
    """Bins and aggregates dimensions for plotting.

    Takes the inputs and produces a list of bins that can be iterated over and
    inspected for their metadata. The bins provide easy access to consistent
    labeling, bounds, and values.
    """

    def calculate(self):

        bin_str = '_bin'
        self.bin_column = self.column + bin_str
        bin_models = []

        data = self.bin_stat.get_data()
        bins = self.bin_stat.bins

        # Choose bin bounds when data range is ill-defined; pd.cut()
        # does not handle this well for values that are <= 0
        if data.size < 2:
            raise ValueError('Histogram data must have at least two elements.')
        if data.ndim == 1 and data.std() == 0:
            margin = 0.01 * abs(float(data[0])) or 0.01
            bins = np.linspace(data[0] - margin, data[0] + margin, bins + 1)

        binned, bin_bounds = pd.cut(data, bins, retbins=True,
                                    include_lowest=True, precision=0)

        self.bin_width = np.round(bin_bounds[2] - bin_bounds[1], 1)

        if self.source is not None:
            # add bin column to data source
            self.source.add(binned.tolist(), name=self.bin_column)
            df = self.source.to_df()
        else:
            df = pd.DataFrame({self.column: self.values,
                               self.bin_column: binned})

        for name, group in df.groupby(self.bin_column):
            bin_models.append(Bin(bin_label=name,
                                  values=group[self.column],
                                  stat=self.stat))

        self.bins = bin_models

        centers = binned.copy()
        centers = centers.astype(str)
        for bin in self.bins:
            centers[binned == bin.label] = bin.center

        self.centers_column = self.column + '_center'
        if self.source is not None:
            self.source.add(centers.tolist(), name=self.centers_column)
        else:
            df[self.centers_column] = centers

    def __getitem__(self, item):
        return self.bins[item]

    def apply(self, data):
        self.set_data(data.source)
        return self.source.to_df()

    def sort(self, ascending=True):
        if self.bins is not None:
            self.bins = list(sorted(self.bins, key=lambda x: x.center,
                                    reverse=not ascending))
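
# Illustrative usage (a sketch, not executed at import time): binning a raw
# column of values with ``Bins`` and inspecting per-bin metadata. The sample
# data below is a hypothetical placeholder.
#
#     data = pd.Series(np.random.normal(size=100))
#     binned = Bins(values=data)          # bin count chosen by Freedman-Diaconis
#     binned[0].label, binned[0].center   # bin metadata via __getitem__
#     binned.bins[0].value                # aggregated stat per bin (count by default)
#     binned.sort(ascending=False)        # reorder bins by center, descending
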
class Histogram(BinnedStat):
    """Bins and aggregates dimensions for plotting.

    Takes the inputs and produces a list of bins that can be iterated over and
    inspected for their metadata. The bins provide easy access to consistent
    labeling, bounds, and values.
    """

    density = Bool(False, help="""
        Whether to normalize the histogram.

        If True, the result is the value of the probability *density* function
        at the bin, normalized such that the *integral* over the range is 1. If
        False, the result will contain the number of samples in each bin.

        For more information, see the ``numpy.histogram`` documentation.

        (default: False)
        """)

    def calculate(self):
        bin_str = '_bin'
        self.bin_column = self.column + bin_str

        data = self.bin_stat.get_data()
        bins = self.bin_stat.bins

        binned, bin_bounds = np.histogram(
            np.array(data), density=self.density, bins=bins
        )

        self.bin_width = np.round(bin_bounds[2] - bin_bounds[1], 1)
        self.bins = []

        for i, b in enumerate(binned):
            width = bin_bounds[i+1] - bin_bounds[i]
            if i == 0:
                lbl = "[%f, %f]" % (bin_bounds[i], bin_bounds[i+1])
            else:
                lbl = "(%f, %f]" % (bin_bounds[i], bin_bounds[i+1])
            self.bins.append(Bin(bin_label=lbl, values=[binned[i]],
                                 stat=Max(), width=width))


def bins(data, values=None, column=None, bins=None, labels=None, **kwargs):
    """Specify binning or bins to be used for column or values."""

    if isinstance(data, string_types):
        column = data
        values = None
    else:
        column = None

    return Bins(values=values, column=column, bins=bins, **kwargs)


stats = {
    'sum': Sum,
    'mean': Mean,
    'count': Count,
    'nunique': CountDistinct,
    'median': Median,
    'stddev': StdDeviation,
    'min': Min,
    'max': Max,
    'quantile': Quantile,
}
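
# Illustrative usage (a sketch, not executed at import time): the ``stats``
# registry maps short names to Stat classes, ``bins()`` is a convenience
# wrapper around ``Bins``, and ``Histogram`` builds numpy-based bins. The
# column name 'cyl' and the sample data are hypothetical placeholders.
#
#     stat = stats['median']()                  # equivalent to Median()
#     binner = bins('cyl', bins=10)             # deferred until set_data() is called
#     hist = Histogram(values=pd.Series(np.random.normal(size=200)), bins=20)
#     [(b.label, b.value) for b in hist.bins]   # per-bin labels and counts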