Source code for pema.summary_plots

import matplotlib.pyplot as plt
from scipy.stats import beta
from scipy import stats
import numpy as np
from multihist import Hist1d
import strax
import multihist
from scipy.stats import norm
from immutabledict import immutabledict
from copy import deepcopy

export, __all__ = strax.exporter()

outcome_colors = {
    'found': 'darkblue',
    'chopped': 'mediumslateblue',

    'missed': 'red',
    'merged': 'turquoise',
    'split': 'purple',

    'misid_as_s2': 'orange',
    'misid_as_s1': 'goldenrod',
    'split_and_misid': 'darkorange',
    'merged_to_s2': 'chocolate',
    'merged_to_s1': 'sandybrown',
    'merged_to_unknown': 'khaki',

    'unclassified': 'green',
    'split_and_unclassified': 'seagreen',
    'merged_and_unclassified': 'limegreen',
}

DEFAULT_NPH_LABEL = r'$\#\mathrm{Photon\ detected\ (excl. DPE)}$'


[docs]@export
def peak_matching_histogram(results, histogram_key, bin_edges):
    """
    Make 1D histogram of peak matching results (=peaks with extra fields)
    added by histogram_key
    """

    if histogram_key not in results.dtype.names:
        raise ValueError(
            'Histogram key %s should be one of the columns in results: %s' % (
                histogram_key,
                results.dtype.names))

    # How many true peaks do we have in each bin in total?
    n_peaks_hist = Hist1d(results[histogram_key], bin_edges)
    hists = {'_total': n_peaks_hist}

    for outcome in np.unique(results['outcome']):
        # Histogram the # of peaks that have this outcome
        hist = Hist1d(results[results['outcome'] == outcome][histogram_key],
                      bins=n_peaks_hist.bin_edges)
        hists[outcome] = hist

    return hists


[docs]@export
def plot_peak_matching_histogram(*args, **kwargs):
    hists = peak_matching_histogram(*args, **kwargs)
    _plot_peak_matching_histogram(hists)


def _plot_peak_matching_histogram(hists):
    """
    Make 1D histogram of peak matching results (=peaks with extra fields)
    added by histogram_key
    """

    n_peaks_hist = hists['_total']

    for outcome, hist in hists.items():
        hist = hist.histogram.astype(float)

        if outcome == '_total':
            continue

        print("\t%0.2f%% %s" % (100 * hist.sum() / n_peaks_hist.n, outcome))

        # Compute Errors on estimate of a proportion
        # Should have vectorized this... lazy
        # Man this code is ugly!!!!
        limits_d = []
        limits_u = []
        for i, x in enumerate(hist):
            limit_d, limit_u = binom_interval(x,
                                              total=n_peaks_hist.histogram[i])
            limits_d.append(limit_d)
            limits_u.append(limit_u)
        limits_d = np.array(limits_d)
        limits_u = np.array(limits_u)

        # Convert hist to proportion
        hist /= n_peaks_hist.histogram.astype('float')

        color = outcome_colors.get(outcome, np.random.rand(3, ))
        plt.errorbar(x=n_peaks_hist.bin_centers,
                     y=hist,
                     yerr=[hist - limits_d, limits_u - hist],
                     label=outcome,
                     color=color,
                     linestyle='-' if outcome == 'found' else '',
                     drawstyle='steps-mid',
                     marker='s')

        # Wald intervals: not so good
        # errors = np.sqrt(
        #     hist*(1-hist)/all_true_peaks_histogram
        # )
        # plt.errorbar(x=bin_centers, y=hist, yerr = errors, label=outcome)

    plt.xlim(n_peaks_hist.bin_edges[0], n_peaks_hist.bin_edges[-1])
    plt.ylabel('Fraction of peaks')
    plt.ylim(0, 1)
    plt.legend(loc='lower right', shadow=True)
    legend = plt.legend(loc='best', prop={'size': 10})
    if legend and legend.get_frame():
        legend.get_frame().set_alpha(0.8)


[docs]@export
def binom_interval(success, total, conf_level=0.95):
    """
    Confidence interval on binomial - using Jeffreys interval
    Code stolen from https://gist.github.com/paulgb/6627336
    Agrees with http://statpages.info/confint.html for binom_interval(1, 10)
    """
    # Should we add a special case for success = 0 or = total? see wikipedia
    quantile = (1 - conf_level) / 2.
    lower = beta.ppf(quantile, success, total - success + 1)
    upper = beta.ppf(1 - quantile, success + 1, total - success)
    # If something went wrong with a limit calculation, report the trivial limit
    if np.isnan(lower):
        lower = 0
    if np.isnan(upper):
        upper = 1
    return lower, upper


def get_interval(x, n, found):
    one_sigma = stats.norm.cdf(1) - stats.norm.cdf(-1)
    eff = found / n

    limits = np.zeros((len(x), 2))
    for i, w in enumerate(x):
        limits[i, :] = binom_interval(found[i], total=n[i], conf_level=one_sigma)
    yerr = np.abs(limits.T - eff)
    return eff, yerr


def calc_arb_acceptance(data, on_axis, bin_edges, nbins=None, ) -> tuple:
    """Calculate acceptance on given axis"""
    if nbins is None:
        nbins = bin_edges[-1] - bin_edges[0]
    be = np.linspace(*bin_edges, nbins + 1)
    bin_centers = (be[1:] + be[:-1]) / 2
    total = np.zeros(len(bin_centers))
    found = np.zeros(len(bin_centers))

    data_remaining = data[[on_axis, 'acceptance_fraction']].copy()
    for bi in range(len(be) - 1):
        mask = (data_remaining[on_axis] >= be[bi]) & (data_remaining[on_axis] < be[bi + 1])
        total[bi] = len(data_remaining[mask])
        found[bi] = np.sum(data_remaining[mask]['acceptance_fraction'])
        data_remaining = data_remaining[~mask]

    values, yerr = get_interval(bin_centers, total, found)
    return bin_centers, values, yerr


def acceptance_plot(data, on_axis, bin_edges, nbins=None, plot_label=""):
    """
    Compute acceptance from data using acceptance_fraction
    (this is an arbitrary weighing of the acceptance based on the outcome of matching)
    """
    bin_centers, values, yerr = calc_arb_acceptance(data, on_axis, bin_edges, nbins)
    _plot_acc(bin_centers, values, yerr, plot_label)
    plt.xlabel(on_axis.replace('_', ' '))


def _plot_acc(bin_centers, values, yerr, plot_label):
    plt.errorbar(x=bin_centers,
                 y=values,
                 yerr=yerr,
                 linestyle='none',
                 marker='o',
                 markersize=4,
                 capsize=3,
                 label=plot_label,
                 )


def _plot_mh_percentile(mh: multihist.Histdd,
                        percentile: int,
                        **kwargs,
                        ):
    percentile_from_mh = mh.percentile(percentile, mh.axis_names[1])
    kwargs.setdefault('drawstyle', 'steps-mid', )
    plt.plot(percentile_from_mh.bin_centers,
             percentile_from_mh,
             **kwargs
             )


def rec_plot(dat,
             dpe_offset=0.0,
             show_hist=True,
             axis_names=('raw_area_trigger', 'rec_bias'),
             print_dpe=True,
             _percentiles_y_bins=500,
             _y_label_kwargs=immutabledict(fontsize=18),
             **kwargs):
    kwargs.setdefault('bins', 50)
    kwargs.setdefault('range', [[0, 50], [-1, 1]])
    x_ax, y_ax = axis_names

    mh_display = multihist.Histdd(axis_names=axis_names, **kwargs)

    # Make a second Multihist with the full y-range for reliable percentiles,
    # otherwise you might get wrong results since you cut away data from the
    # range kwargs.
    percentile_kwargs = deepcopy(kwargs)
    y_data = dat[y_ax] / (1 + dpe_offset) - 1
    percentile_kwargs['range'][1] = [y_data.min(), y_data.max()]
    if isinstance(kwargs['bins'], (list, tuple, np.ndarray)):
        percentile_kwargs['bins'] = [kwargs['bins'][0], _percentiles_y_bins]
    else:
        percentile_kwargs['bins'] = [kwargs['bins'], _percentiles_y_bins]
    mh_for_percentiles = multihist.Histdd(axis_names=axis_names, **percentile_kwargs)

    mh_display.add(dat[x_ax], y_data)
    mh_for_percentiles.add(dat[x_ax], y_data)

    one_sigma = 100 * (1 - norm.cdf(1))
    median = 50
    minus_one_sigma = 100 * (1 - norm.cdf(-1))
    _plot_mh_percentile(mh_for_percentiles, median,
                        color='whitesmoke', label='median')
    _plot_mh_percentile(mh_for_percentiles, one_sigma,
                        color='cyan', label=f'{one_sigma:.1f}% quantile')
    _plot_mh_percentile(mh_for_percentiles, minus_one_sigma,
                        color='green', label=f'{minus_one_sigma:.1f}% quantile')

    del mh_for_percentiles

    if show_hist:
        mh_display.plot(log_scale=True)
    a_rec = r'\frac{\mathrm{Area}_\mathrm{reconstructed}\mathrm{\ }[\mathrm{PE}]'
    dpe_div = r'\mathrm{\ }/\mathrm{\ }(1+p_{DPE})}'
    if print_dpe:
        dpe_div = dpe_div[:-1] + r'\mathrm{\ }|\mathrm{\ }p_{DPE}=' + f'{dpe_offset}' + '}'
    true_nph = DEFAULT_NPH_LABEL
    plt.xlabel(true_nph)
    y_label = r'$' + a_rec + dpe_div + r'{' + true_nph.replace('$', '') + r'}-1$'
    plt.ylabel(y_label, **_y_label_kwargs)
    plt.grid()


def _rec_kwargs(s1_kwargs=None,
                s2_kwargs=None):
    if s1_kwargs is None:
        s1_kwargs = {}
    if s2_kwargs is None:
        s2_kwargs = {}
    return s1_kwargs, s2_kwargs


def _rec_diff_inner(dat, title, **kwargs):
    if not len(dat):
        return
    rec_plot(dat, **kwargs)
    plt.axhline(0, linestyle='--', c='k')
    plt.title(title)


def reconstruction_bias(data, **kwargs):
    rec_diff(data, None, **kwargs)


def rec_diff(def_data,
             data_alt,
             data_set_names=("default", "custom"),
             s1_kwargs=immutabledict(dpe_offset=0.0),
             s2_kwargs=immutabledict(dpe_offset=0.0),
             ):
    make_diff = data_alt is not None
    data_sets = [def_data, data_alt] if make_diff else [def_data]

    f, axes = plt.subplots(2, len(data_sets), figsize=(18, 13))
    s1_kwargs, s2_kwargs = _rec_kwargs(s1_kwargs,
                                       s2_kwargs)
    axes = iter(axes.flatten())
    for axi, dat in enumerate(data_sets):
        plt.sca(next(axes))
        mask = (dat['type'] == 1) & (dat['outcome'] == 'found')
        _rec_diff_inner(dat[mask],
                        title=f'{data_set_names[axi]} S1 rec. bias',
                        **s1_kwargs)
        if axi == 0:
            plt.legend()
    for axi, dat in enumerate(data_sets):
        plt.sca(next(axes))
        mask = (dat['type'] == 2) & (dat['outcome'] == 'found')
        _rec_diff_inner(dat[mask],
                        title=f'{data_set_names[axi]} S2 rec. bias',
                        **s2_kwargs)
    return axes