#!/usr/bin/python
"""
`Likelihoods` class for computing Poisson and Barlow likelihoods.
The Poisson likelihood assumes the template being compared against is the
perfect expectation, while the Barlow likelihood accounts for the template
being imperfect due to being generated from finite Monte Carlo statistics.
"""
from __future__ import absolute_import, division, print_function
from copy import copy
import sys
import numpy as np
from scipy.optimize import minimize
__all__ = ['Likelihoods']
__author__ = 'Michael Larson'
__email__ = 'mlarson@nbi.ku.dk'
__date__ = '2016-03-14'
__license__ = '''Copyright (c) 2014-2017, The IceCube Collaboration
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.'''
#------------------------------------------------------------------------------
# The following is adapted from user Alfe, https://stackoverflow.com/a/16787722
def handle_uncaught_exception(exctype, value, trace):
    """Exception hook that allows for exiting with a specified exit code.

    Replaces sys.excepthook (the original hook is stashed in
    OLD_EXCEPTHOOK below). The previous hook is called first so the
    traceback is still printed; then, if the exception instance carries
    an `exitcode` attribute, the process exits with that code.

    Parameters
    ----------
    exctype : type
        The exception class.
    value : BaseException
        The exception instance; if it has an `exitcode` attribute,
        `sys.exit` is called with that value.
    trace : traceback or None
        The associated traceback object.
    """
    OLD_EXCEPTHOOK(exctype, value, trace)
    if hasattr(value, 'exitcode'):
        sys.exit(value.exitcode)

# Install the hook, stashing the previous one so it can still be invoked.
sys.excepthook, OLD_EXCEPTHOOK = handle_uncaught_exception, sys.excepthook
#------------------------------------------------------------------------------
class _ExitCodeError(Exception):
    """Base for errors whose `exitcode` is honored by the excepthook.

    Accepts an optional `exitcode=` keyword to override the class-level
    default on a per-instance basis. (Plain `Exception` subclasses reject
    keyword arguments with a `TypeError`, which would mask the real error
    at raise sites that pass `exitcode=...`.)
    """
    # Default process exit code; subclasses override.
    exitcode = 1

    def __init__(self, *args, **kwargs):
        # Per-instance override of the class-level default, if given.
        self.exitcode = kwargs.pop('exitcode', self.exitcode)
        if kwargs:
            raise TypeError(
                'Unexpected keyword arguments: {}'.format(sorted(kwargs))
            )
        super(_ExitCodeError, self).__init__(*args)


class ShapeError(_ExitCodeError):
    """A histogram's shape disagrees with the expected shape."""
    exitcode = 100


class NaNValueError(_ExitCodeError):
    """A histogram contains NaN values."""
    exitcode = 101


class ArgValueError(_ExitCodeError):
    """An argument had an unrecognized value."""
    exitcode = 102
class Likelihoods(object):
    """
    A class to handle the likelihood calculations in OscFit. It can
    be used in other poisson binned-likelihood calculators as well.
    The class includes two likelihood spaces: the Poisson and the
    Barlow LLH.

    The Poisson likelihood is the ideal case and assumes infinite
    statistical accuracy on the Monte Carlo distributions. This is
    the simple case and has been used in the past for analyses, but
    should not be used if there's a significant statistical error on
    one of the samples.

    The Barlow likelihood is an implementation of the likelihood model
    given in doi:10.1016/0010-4655(93)90005-W ("Fitting using finite
    Monte Carlo samples" by Barlow and Beeston). This requires the
    unweighted distributions of the MC and involves assuming that each
    MC samples is drawn from an underlying "true" expectation distribution
    which should be fit. This gives (number of bins)x(number of samples)
    new parameters to fit and allows the shape of the distributions to
    fluctuate in order to fit both the observed MC and the observed
    data. This gives a more informed result and allows one to estimate
    the effect of the finite MC samples.

    To use this, you need to run set_data, set_mc, and the set_unweighted
    functions.

    .. important:: the `set_mc` function takes a histogram of the average
       weight per event for each bin! not the total weighted histogram!

    Simply calling the get_llh function after these will return the
    best fit LLH for your chosen likelihood function. The function takes
    the name of the likelihood space ("Poisson" or "Barlow"). You can
    retrieve the best fit weighted plots using the get_plot (total best-fit
    histogram including all samples) and the get_single_plots (the list
    of best-fit weighted histograms for each sample).
    """
    # Class-level defaults, mirrored by __init__ so each instance starts clean
    mc_histograms = None          # (n_samples, n_bins) average weight per event
    unweighted_histograms = None  # (n_samples, n_bins) raw MC event counts
    data_histogram = None         # flattened data histogram
    shape = None                  # original (unflattened) histogram shape
    bestfit_plots = None          # (n_samples, n_bins) best-fit expectations
    current_bin = None            # bin index during the per-bin Barlow fit

    def __init__(self):
        """Instantiate and set all of the defaults."""
        self.mc_histograms = None
        self.unweighted_histograms = None
        self.data_histogram = None
        self.shape = None
        self.bestfit_plots = None
        self.current_bin = None

    def reset(self):
        """Re-instantiate so that we can reuse the object."""
        self.__init__()

    def set_data(self, data_histogram):
        """Set up the data histogram. This histogram is flattened in order to
        make the looping later on a bit simpler to handle.

        Parameters
        ----------
        data_histogram : numpy.ndarray
            Binned data counts; its shape fixes `self.shape` if not yet set.

        Raises
        ------
        ShapeError
            If the histogram's shape disagrees with the expected shape.
        """
        if self.shape is None:
            self.shape = data_histogram.shape
        if data_histogram.shape != self.shape:
            # NOTE: the original passed `exitcode=100` here, which plain
            # Exception subclasses reject; the class-level default already
            # carries that code.
            raise ShapeError(
                "Data histogram has shape {} but expected histogram shape {}"
                .format(data_histogram.shape, self.shape)
            )
        self.data_histogram = data_histogram.flatten()

    def set_mc(self, mc_histograms):
        """Set up the MC histogram. Each histogram is flattened in order to
        make the looping later on a bit simpler to handle. The values in each
        bin should correspond to the weight-per-event in that bin, NOT the
        total weight in that bin!

        Make sure you don't have any nulls here.

        Parameters
        ----------
        mc_histograms : numpy.ndarray
            Stack of per-sample histograms of average weight per event,
            indexed as mc_histograms[sample].

        Raises
        ------
        NaNValueError
            If any bin is NaN.
        ShapeError
            If any sample's shape disagrees with the expected shape.
        """
        if self.shape is None:
            # BUGFIX: the original used `mc_histograms.values()[0]`, which
            # only works on a py2 dict; these are numpy arrays, so index
            # the first sample directly.
            self.shape = mc_histograms[0].shape
        if np.any(np.isnan(mc_histograms)):
            raise NaNValueError(
                "At least one bin in your MC histogram is NaN! Take a look"
                " and decide how best to handle this"
            )
        flat_histograms = []
        for j in range(mc_histograms.shape[0]):
            if not self.shape == mc_histograms[j].shape:
                raise ShapeError(
                    "MC Histogram {} has shape {} but expected shape {}"
                    .format(j, mc_histograms[j].shape, self.shape)
                )
            flat_histograms.append(mc_histograms[j].flatten())
        self.mc_histograms = np.array(flat_histograms)

    def set_unweighted(self, unweighted_histograms):
        """Save the unweighted distributions in the MC. These can include 0s.

        Parameters
        ----------
        unweighted_histograms : numpy.ndarray
            Stack of per-sample histograms of raw (unweighted) MC counts,
            indexed as unweighted_histograms[sample].

        Raises
        ------
        ShapeError
            If any sample's shape disagrees with the expected shape.
        """
        if self.shape is None:
            # BUGFIX: same `.values()[0]` issue as in set_mc.
            self.shape = unweighted_histograms[0].shape
        flat_histograms = []
        for j in range(unweighted_histograms.shape[0]):
            if not self.shape == unweighted_histograms[j].shape:
                raise ShapeError(
                    "Unweighted histogram {} has shape {} but expected shape"
                    " {}"
                    .format(j, unweighted_histograms[j].shape, self.shape)
                )
            flat_histograms.append(unweighted_histograms[j].flatten())
        self.unweighted_histograms = np.array(flat_histograms)

    def get_plot(self):
        """Get the total weighted best-fit histogram post-fit.

        Returns None if no fit has been run yet (i.e. get_llh not called).
        """
        if self.bestfit_plots is None:
            return
        return np.sum(self.get_single_plots(), axis=0)

    def get_single_plots(self):
        """Get the individual weighted best-fit histograms post-fit.

        Returns None if no fit has been run yet. Otherwise returns an array
        of shape (n_samples,) + self.shape.
        """
        if self.bestfit_plots is None:
            return None
        result = np.multiply(self.mc_histograms, self.bestfit_plots)
        # Generalized from the original hard-coded 2D reshape so that
        # histograms of any rank are handled.
        return np.reshape(result, (result.shape[0],) + tuple(self.shape))

    def get_llh(self, llh_type):
        """Calculate the likelihood given the data, average weights, and the
        unweighted histograms. You can choose between "Poisson" and "Barlow"
        likelihoods at the moment.

        If using the "Barlow" LLH, note that the method is picked to be Powell
        with 25 iterations maximum per step. This is not optimized at all and
        was explicitly chosen simply because it "worked". This doesn't work
        with the bounds set in the normal way, so the positive-definiteness of
        the rates is enforced in the get_llh_barlow_bin method.

        Parameters
        ----------
        llh_type : str
            Either "Poisson" or "Barlow" (case-insensitive).

        Returns
        -------
        float
            The negative log-likelihood at the best fit.

        Raises
        ------
        ArgValueError
            If `llh_type` is not one of the known likelihood spaces.
        """
        llh_type = llh_type.lower()
        # BUGFIX: use a float copy of the unweighted histograms as the
        # starting expectations; writing Powell's float `bin_result.x` back
        # into an integer-typed array would silently truncate it.
        self.bestfit_plots = self.unweighted_histograms.astype(float)
        self.current_bin = 0

        # The simplest case: the Poisson binned likelihood
        if llh_type == "poisson":
            return self.get_llh_poisson()

        # The more complicated case: The Barlow LLH
        # This requires a separate minimization step in each bin to estimate
        # the expected rate in each bin from each MC sample using constraints
        # from the data and the observed MC distribution.
        if llh_type == "barlow":
            llh = 0
            for bin_n in range(len(self.data_histogram)):
                self.current_bin = bin_n
                bin_result = minimize(
                    fun=self.get_llh_barlow_bin,
                    x0=self.bestfit_plots[:, bin_n],
                    method="Powell",
                    options={'maxiter': 25, 'disp': False}
                )
                self.bestfit_plots[:, bin_n] = bin_result.x
                llh += bin_result.fun
            self.current_bin = None
            return llh

        raise ArgValueError(
            'Unknown `llh_type` "{}". Choose either "Poisson" (ideal) or'
            ' "Barlow" (including MC statistical errors).'
            .format(llh_type)
        )

    def get_llh_barlow_bin(self, a_i):
        """The Barlow LLH finds the best-fit "expected" MC distribution using
        both the data and observed MC as constraints. Each bin is independent
        in this calculation, since the assumption is that there are no
        correlations between bins. This likely leads to a somewhat worse limit
        than you might get otherwise, but at least its conservative.

        If you have a good idea for how to extend this to include bin-to-bin,
        let me know and I can help implement it.

        Parameters
        ----------
        a_i : numpy.ndarray
            Candidate expected (true) MC counts per sample for the current
            bin, being fit by the minimizer.

        Returns
        -------
        float
            The negative log-likelihood for this bin (or a large penalty
            for negative expectations, since Powell ignores bounds).
        """
        # Powell doesn't honor bounds, so enforce positive-definiteness of
        # the rates via a large penalty.
        if np.any(a_i < 0):
            return 1e10

        i = self.current_bin
        di = self.data_histogram[i]
        fi = np.sum(np.multiply(self.mc_histograms[:, i], a_i))
        ai = self.unweighted_histograms[:, i]
        llh = 0

        # This is the standard Poisson LLH comparing data to the total
        # weighted MC
        if fi > 0:
            llh += di * np.log(fi) - fi
        # Normalization term (Stirling expansion of Ln(di!)) that helps by
        # centering the LLH near 0
        if di > 0:
            llh -= di * np.log(di) - di

        # The heart of the Barlow LLH. Fitting the a_i (expected number of
        # events in the MC sample for this bin) using the observed MC events
        # as a constraint. Bins with a_i == 0 contribute nothing, so the cut
        # only removes ill-defined log(0) terms.
        cut = a_i > 0
        llh += np.dot(ai[cut], np.log(a_i[cut])) - np.sum(a_i[cut])

        # This is simply a normalization term that helps by centering the LLH
        # near 0
        # It's an expansion of Ln(ai!) using the Sterling expansion
        cut = ai > 0
        llh -= np.dot(ai[cut], np.log(ai[cut])) - np.sum(ai[cut])

        return -llh

    def get_llh_poisson(self):
        """The standard binned-poisson likelihood comparing the weighted MC
        distribution to the data, ignoring MC statistical uncertainties.

        Returns
        -------
        float
            The negative log-likelihood.
        """
        di = self.data_histogram
        # Total weighted MC per bin: average weight per event times the
        # unweighted counts, summed over samples.
        fi = np.sum(np.multiply(self.mc_histograms,
                                self.unweighted_histograms), axis=0)
        llh = 0
        # The normal definition of the LLH, dropping the Ln(di!) constant term
        cut = fi > 0
        llh += np.sum(di[cut] * np.log(fi[cut]) - fi[cut])
        # This is simply a normalization term that helps by centering the LLH
        # near 0
        # It's an expansion of Ln(di!) using the Sterling expansion
        cut = di > 0
        llh -= np.sum(di[cut] * np.log(di[cut]) - di[cut])
        return -llh