#!/usr/bin/python
"""
`Likelihoods` class for computing Poisson and Barlow likelihoods.
The Poisson likelihood assumes the template being compared against is the
perfect expectation, while the Barlow likelihood accounts for the template
being imperfect due to being generated from finite Monte Carlo statistics.
"""
from __future__ import absolute_import, division, print_function
from copy import copy
import sys
import numpy as np
from scipy.optimize import minimize
__all__ = ['Likelihoods']
__author__ = 'Michael Larson'
__email__ = 'mlarson@nbi.ku.dk'
__date__ = '2016-03-14'
__license__ = '''Copyright (c) 2014-2017, The IceCube Collaboration
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.'''
#------------------------------------------------------------------------------
# The following is adapted from user Alfe, https://stackoverflow.com/a/16787722
def handle_uncaught_exception(exctype, value, trace):
    """Exception hook that allows for exiting with a specified exit code.

    Replaces sys.excepthook (the original hook is stashed in
    OLD_EXCEPTHOOK below). The previous hook is called first so the
    traceback is still printed; then, if the exception instance carries
    an `exitcode` attribute, the process exits with that code.

    Parameters
    ----------
    exctype : type
        The exception class.
    value : BaseException
        The exception instance; if it has an `exitcode` attribute,
        `sys.exit` is called with that value.
    trace : traceback or None
        The associated traceback object.
    """
    OLD_EXCEPTHOOK(exctype, value, trace)
    if hasattr(value, 'exitcode'):
        sys.exit(value.exitcode)

# Install the hook, stashing the previous one so it can still be invoked.
sys.excepthook, OLD_EXCEPTHOOK = handle_uncaught_exception, sys.excepthook
#------------------------------------------------------------------------------
class _ExitCodeError(Exception):
    """Base for errors whose `exitcode` is honored by the excepthook.

    Accepts an optional `exitcode=` keyword to override the class-level
    default on a per-instance basis. (Plain `Exception` subclasses reject
    keyword arguments with a `TypeError`, which would mask the real error
    at raise sites that pass `exitcode=...`.)
    """
    # Default process exit code; subclasses override.
    exitcode = 1

    def __init__(self, *args, **kwargs):
        # Per-instance override of the class-level default, if given.
        self.exitcode = kwargs.pop('exitcode', self.exitcode)
        if kwargs:
            raise TypeError(
                'Unexpected keyword arguments: {}'.format(sorted(kwargs))
            )
        super(_ExitCodeError, self).__init__(*args)


class ShapeError(_ExitCodeError):
    """A histogram's shape disagrees with the expected shape."""
    exitcode = 100


class NaNValueError(_ExitCodeError):
    """A histogram contains NaN values."""
    exitcode = 101


class ArgValueError(_ExitCodeError):
    """An argument had an unrecognized value."""
    exitcode = 102
class Likelihoods(object):
    """
    A class to handle the likelihood calculations in OscFit. It can
    be used in other poisson binned-likelihood calculators as well.
    The class includes two likelihood spaces: the Poisson and the
    Barlow LLH.

    The Poisson likelihood is the ideal case and assumes infinite
    statistical accuracy on the Monte Carlo distributions. This is
    the simple case and has been used in the past for analyses, but
    should not be used if there's a significant statistical error on
    one of the samples.

    The Barlow likelihood is an implementation of the likelihood model
    given in doi:10.1016/0010-4655(93)90005-W ("Fitting using finite
    Monte Carlo samples" by Barlow and Beeston). This requires the
    unweighted distributions of the MC and involves assuming that each
    MC samples is drawn from an underlying "true" expectation distribution
    which should be fit. This gives (number of bins)x(number of samples)
    new parameters to fit and allows the shape of the distributions to
    fluctuate in order to fit both the observed MC and the observed
    data. This gives a more informed result and allows one to estimate
    the effect of the finite MC samples.

    To use this, you need to run set_data, set_mc, and the set_unweighted
    functions.

    .. important:: the `set_mc` function takes a histogram of the average
       weight per event for each bin! not the total weighted histogram!

    Simply calling the get_llh function after these will return the
    best fit LLH for your chosen likelihood function. The function takes
    the name of the likelihood space ("Poisson" or "Barlow"). You can
    retrieve the best fit weighted plots using the get_plot (total best-fit
    histogram including all samples) and the get_single_plots (the list
    of best-fit weighted histograms for each sample).
    """
    # Class-level defaults, mirrored by __init__ so each instance starts clean
    mc_histograms = None          # (n_samples, n_bins) average weight per event
    unweighted_histograms = None  # (n_samples, n_bins) raw MC event counts
    data_histogram = None         # flattened data histogram
    shape = None                  # original (unflattened) histogram shape
    bestfit_plots = None          # (n_samples, n_bins) best-fit expectations
    current_bin = None            # bin index during the per-bin Barlow fit

    def __init__(self):
        """Instantiate and set all of the defaults."""
        self.mc_histograms = None
        self.unweighted_histograms = None
        self.data_histogram = None
        self.shape = None
        self.bestfit_plots = None
        self.current_bin = None

    def reset(self):
        """Re-instantiate so that we can reuse the object."""
        self.__init__()

    def set_data(self, data_histogram):
        """Set up the data histogram. This histogram is flattened in order to
        make the looping later on a bit simpler to handle.

        Parameters
        ----------
        data_histogram : numpy.ndarray
            Binned data counts; its shape fixes `self.shape` if not yet set.

        Raises
        ------
        ShapeError
            If the histogram's shape disagrees with the expected shape.
        """
        if self.shape is None:
            self.shape = data_histogram.shape
        if data_histogram.shape != self.shape:
            # NOTE: the original passed `exitcode=100` here, which plain
            # Exception subclasses reject; the class-level default already
            # carries that code.
            raise ShapeError(
                "Data histogram has shape {} but expected histogram shape {}"
                .format(data_histogram.shape, self.shape)
            )
        self.data_histogram = data_histogram.flatten()

    def set_mc(self, mc_histograms):
        """Set up the MC histogram. Each histogram is flattened in order to
        make the looping later on a bit simpler to handle. The values in each
        bin should correspond to the weight-per-event in that bin, NOT the
        total weight in that bin!

        Make sure you don't have any nulls here.

        Parameters
        ----------
        mc_histograms : numpy.ndarray
            Stack of per-sample histograms of average weight per event,
            indexed as mc_histograms[sample].

        Raises
        ------
        NaNValueError
            If any bin is NaN.
        ShapeError
            If any sample's shape disagrees with the expected shape.
        """
        if self.shape is None:
            # BUGFIX: the original used `mc_histograms.values()[0]`, which
            # only works on a py2 dict; these are numpy arrays, so index
            # the first sample directly.
            self.shape = mc_histograms[0].shape
        if np.any(np.isnan(mc_histograms)):
            raise NaNValueError(
                "At least one bin in your MC histogram is NaN! Take a look"
                " and decide how best to handle this"
            )
        flat_histograms = []
        for j in range(mc_histograms.shape[0]):
            if not self.shape == mc_histograms[j].shape:
                raise ShapeError(
                    "MC Histogram {} has shape {} but expected shape {}"
                    .format(j, mc_histograms[j].shape, self.shape)
                )
            flat_histograms.append(mc_histograms[j].flatten())
        self.mc_histograms = np.array(flat_histograms)

    def set_unweighted(self, unweighted_histograms):
        """Save the unweighted distributions in the MC. These can include 0s.

        Parameters
        ----------
        unweighted_histograms : numpy.ndarray
            Stack of per-sample histograms of raw (unweighted) MC counts,
            indexed as unweighted_histograms[sample].

        Raises
        ------
        ShapeError
            If any sample's shape disagrees with the expected shape.
        """
        if self.shape is None:
            # BUGFIX: same `.values()[0]` issue as in set_mc.
            self.shape = unweighted_histograms[0].shape
        flat_histograms = []
        for j in range(unweighted_histograms.shape[0]):
            if not self.shape == unweighted_histograms[j].shape:
                raise ShapeError(
                    "Unweighted histogram {} has shape {} but expected shape"
                    " {}"
                    .format(j, unweighted_histograms[j].shape, self.shape)
                )
            flat_histograms.append(unweighted_histograms[j].flatten())
        self.unweighted_histograms = np.array(flat_histograms)

    def get_plot(self):
        """Get the total weighted best-fit histogram post-fit.

        Returns None if no fit has been run yet (i.e. get_llh not called).
        """
        if self.bestfit_plots is None:
            return
        return np.sum(self.get_single_plots(), axis=0)

    def get_single_plots(self):
        """Get the individual weighted best-fit histograms post-fit.

        Returns None if no fit has been run yet. Otherwise returns an array
        of shape (n_samples,) + self.shape.
        """
        if self.bestfit_plots is None:
            return None
        result = np.multiply(self.mc_histograms, self.bestfit_plots)
        # Generalized from the original hard-coded 2D reshape so that
        # histograms of any rank are handled.
        return np.reshape(result, (result.shape[0],) + tuple(self.shape))

    def get_llh(self, llh_type):
        """Calculate the likelihood given the data, average weights, and the
        unweighted histograms. You can choose between "Poisson" and "Barlow"
        likelihoods at the moment.

        If using the "Barlow" LLH, note that the method is picked to be Powell
        with 25 iterations maximum per step. This is not optimized at all and
        was explicitly chosen simply because it "worked". This doesn't work
        with the bounds set in the normal way, so the positive-definiteness of
        the rates is enforced in the get_llh_barlow_bin method.

        Parameters
        ----------
        llh_type : str
            Either "Poisson" or "Barlow" (case-insensitive).

        Returns
        -------
        float
            The negative log-likelihood at the best fit.

        Raises
        ------
        ArgValueError
            If `llh_type` is not one of the known likelihood spaces.
        """
        llh_type = llh_type.lower()
        # BUGFIX: use a float copy of the unweighted histograms as the
        # starting expectations; writing Powell's float `bin_result.x` back
        # into an integer-typed array would silently truncate it.
        self.bestfit_plots = self.unweighted_histograms.astype(float)
        self.current_bin = 0

        # The simplest case: the Poisson binned likelihood
        if llh_type == "poisson":
            return self.get_llh_poisson()

        # The more complicated case: The Barlow LLH
        # This requires a separate minimization step in each bin to estimate
        # the expected rate in each bin from each MC sample using constraints
        # from the data and the observed MC distribution.
        if llh_type == "barlow":
            llh = 0
            for bin_n in range(len(self.data_histogram)):
                self.current_bin = bin_n
                bin_result = minimize(
                    fun=self.get_llh_barlow_bin,
                    x0=self.bestfit_plots[:, bin_n],
                    method="Powell",
                    options={'maxiter': 25, 'disp': False}
                )
                self.bestfit_plots[:, bin_n] = bin_result.x
                llh += bin_result.fun
            self.current_bin = None
            return llh

        raise ArgValueError(
            'Unknown `llh_type` "{}". Choose either "Poisson" (ideal) or'
            ' "Barlow" (including MC statistical errors).'
            .format(llh_type)
        )

    def get_llh_barlow_bin(self, a_i):
        """The Barlow LLH finds the best-fit "expected" MC distribution using
        both the data and observed MC as constraints. Each bin is independent
        in this calculation, since the assumption is that there are no
        correlations between bins. This likely leads to a somewhat worse limit
        than you might get otherwise, but at least its conservative.

        If you have a good idea for how to extend this to include bin-to-bin,
        let me know and I can help implement it.

        Parameters
        ----------
        a_i : numpy.ndarray
            Candidate expected (true) MC counts per sample for the current
            bin, being fit by the minimizer.

        Returns
        -------
        float
            The negative log-likelihood for this bin (or a large penalty
            for negative expectations, since Powell ignores bounds).
        """
        # Powell doesn't honor bounds, so enforce positive-definiteness of
        # the rates via a large penalty.
        if np.any(a_i < 0):
            return 1e10

        i = self.current_bin
        di = self.data_histogram[i]
        fi = np.sum(np.multiply(self.mc_histograms[:, i], a_i))
        ai = self.unweighted_histograms[:, i]
        llh = 0

        # This is the standard Poisson LLH comparing data to the total
        # weighted MC
        if fi > 0:
            llh += di * np.log(fi) - fi
        # Normalization term (Stirling expansion of Ln(di!)) that helps by
        # centering the LLH near 0
        if di > 0:
            llh -= di * np.log(di) - di

        # The heart of the Barlow LLH. Fitting the a_i (expected number of
        # events in the MC sample for this bin) using the observed MC events
        # as a constraint. Bins with a_i == 0 contribute nothing, so the cut
        # only removes ill-defined log(0) terms.
        cut = a_i > 0
        llh += np.dot(ai[cut], np.log(a_i[cut])) - np.sum(a_i[cut])

        # This is simply a normalization term that helps by centering the LLH
        # near 0
        # It's an expansion of Ln(ai!) using the Sterling expansion
        cut = ai > 0
        llh -= np.dot(ai[cut], np.log(ai[cut])) - np.sum(ai[cut])

        return -llh

    def get_llh_poisson(self):
        """The standard binned-poisson likelihood comparing the weighted MC
        distribution to the data, ignoring MC statistical uncertainties.

        Returns
        -------
        float
            The negative log-likelihood.
        """
        di = self.data_histogram
        # Total weighted MC per bin: average weight per event times the
        # unweighted counts, summed over samples.
        fi = np.sum(np.multiply(self.mc_histograms,
                                self.unweighted_histograms), axis=0)
        llh = 0
        # The normal definition of the LLH, dropping the Ln(di!) constant term
        cut = fi > 0
        llh += np.sum(di[cut] * np.log(fi[cut]) - fi[cut])
        # This is simply a normalization term that helps by centering the LLH
        # near 0
        # It's an expansion of Ln(di!) using the Sterling expansion
        cut = di > 0
        llh -= np.sum(di[cut] * np.log(di[cut]) - di[cut])
        return -llh