Skip to content

Commit

Permalink
feat: Add script to validate norm + histo systematics (#13)
Browse files Browse the repository at this point in the history
* Add python script which allows the user to visualize the sizes of norm + histo systs relative to nominal
* Reads in background-only json file and signal patches
* Accepts command-line arguments to adapt the script for different analyses
* Validation script also included to test with published sbottom likelihoods
  • Loading branch information
danikam committed May 6, 2020
1 parent b3e813e commit 97cc34c
Show file tree
Hide file tree
Showing 3 changed files with 383 additions and 0 deletions.
362 changes: 362 additions & 0 deletions scripts/validate_systs.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,362 @@
"""
Date: 200325
History:
- Originally written as a python notebook by Lukas Heinrich
- Adapted for the 3L-RJ likelihood validation by Giordon Stark (https://github.com/kratsg/3L-RJ-mimic-likelihood-validation/blob/master/OutlierPlot.ipynb)
- Adapted and generalized to script by Danika MacDonell [March 25, 2020]
Details: Script to visualize the relative size of the systematics for pyhf likelihoods at each mass point, where the systematics are added together in quadrature. Should be run in a directory containing the background-only json likelihood file, along with a patch json likelihood file for each signal point. Assumes that all the json patch file are located at the same directory level as this script.
Usage:
>> python OutlierPlot.py --signal_template <signal_template_{a}_{b}_{c}_for_masses> --x_var <which variable in signal name template to plot on x axis (defaults to 'a')> --y_var <which variable in signal name template to plot on x axis (defaults to 'b')> --v_max <max colourbar amplitude> --x_label <x axis label> --y_label <y axis label>
Example for 1Lbb Wh analysis (https://glance.cern.ch/atlas/analysis/analyses/details.php?id=2969):
>> python OutlierPlot.py --signal_template C1N2_Wh_hbb_{a}_{b} --x_var a --y_var b --v_max 10 --x_label '$m(\tilde{\chi}_{1}^{\pm}/\tilde{\chi}_{2}^{0})$ [GeV]' --y_label '$m(\tilde{\chi}_{1}^{0})$ [GeV]'
The signal template is the name of an arbitrary signal in the json patch files, with the signal masses left as {}. Given a background-only file and a patch file, the signal name can be found under "samples" in the output of:
>> jsonpatch BkgOnly.json patch_XXX.json | pyhf inspect
If, for example, one of the signals is called C1N2_Wh_hbb_550_200, where 550 and 200 are the variable model masses, the signal template would be C1N2_Wh_hbb_{}_{}.
"""

import json
import glob
import numpy as np
import matplotlib.pyplot as plt
import scipy.interpolate
import parse
import click
import pyhf

plt.rc("xtick", labelsize=14)
plt.rc("ytick", labelsize=14)


def handle_deltas(delta_up, delta_dn):
nom_is_center = np.bitwise_or(
np.bitwise_and(delta_up > 0, delta_dn > 0),
np.bitwise_and(delta_up <= 0, delta_dn <= 0),
)
span = delta_dn + delta_up
maxdel = np.maximum(np.abs(delta_dn), np.abs(delta_dn))
abs_unc = np.where(nom_is_center, span, maxdel)
return abs_unc


def process_patch(p):
nom = np.asarray(p["value"]["data"])

# histosys
hid = np.asarray(
[
m["data"]["hi_data"]
for m in p["value"]["modifiers"]
if m["type"] == "histosys"
]
)
lod = np.asarray(
[
m["data"]["lo_data"]
for m in p["value"]["modifiers"]
if m["type"] == "histosys"
]
)
delta_up = hid - nom
delta_dn = nom - lod
histo_deltas = handle_deltas(delta_up, delta_dn)

hi = np.asarray(
[m["data"]["hi"] for m in p["value"]["modifiers"] if m["type"] == "normsys"]
)
lo = np.asarray(
[m["data"]["lo"] for m in p["value"]["modifiers"] if m["type"] == "normsys"]
)
delta_up = np.asarray([h * nom - nom for h in hi])
delta_dn = np.asarray([l * nom - nom for l in lo])
norm_deltas = handle_deltas(delta_up, delta_dn)

stat_deltas = np.zeros_like(
histo_deltas
) # Don't consider the stat error for this calculation

systs = np.concatenate([histo_deltas, norm_deltas, stat_deltas])
inquad = np.sqrt(np.sum(np.square(systs), axis=0))
rel = inquad / nom
rel = np.where(nom == 0, np.ones_like(nom), rel)
return rel, nom


def plot_rel_systs(p, channel_names, channel_bins):

signal_name = p["value"]["name"]
channel_name = channel_names[p["path"]]
nom = np.asarray(p["value"]["data"])
n_bins = channel_bins[p["path"]]

# Collect syst names and deltas for histo systs
sys_names_histo = [
np.asarray(
[m["name"] for m in p["value"]["modifiers"] if m["type"] == "histosys"]
)
][0].tolist()
hid = np.asarray(
[
m["data"]["hi_data"]
for m in p["value"]["modifiers"]
if m["type"] == "histosys"
]
)
lod = np.asarray(
[
m["data"]["lo_data"]
for m in p["value"]["modifiers"]
if m["type"] == "histosys"
]
)
delta_up = hid - nom
delta_dn = nom - lod
histo_deltas = handle_deltas(delta_up, delta_dn)
histo_rel_size = np.absolute(histo_deltas) / (1.0 * nom)

# Collect syst names and deltas for norm systs
sys_names_norm = [
np.asarray(
[m["name"] for m in p["value"]["modifiers"] if m["type"] == "normsys"]
)
][0].tolist()
hi = np.asarray(
[m["data"]["hi"] for m in p["value"]["modifiers"] if m["type"] == "normsys"]
)
lo = np.asarray(
[m["data"]["lo"] for m in p["value"]["modifiers"] if m["type"] == "normsys"]
)
delta_up = np.asarray([h * nom - nom for h in hi])
delta_dn = np.asarray([l * nom - nom for l in lo])
norm_deltas = handle_deltas(delta_up, delta_dn)
norm_rel_size = np.absolute(norm_deltas) / (1.0 * nom)

sys_names = sys_names_histo + sys_names_norm

for iBin in range(n_bins):
bin_number = iBin + 1
rel_size = np.concatenate((histo_rel_size[:, iBin], norm_rel_size[:, iBin]))
sys_names_bin = [
sys_names[idx]
for idx, size in enumerate(rel_size)
if size > 0.5 and np.isfinite(size)
] # Select for relative systematics above 0.5 (i.e. >50% variation from nominal)
rel_size = rel_size[(rel_size > 0.5) & (np.isfinite(rel_size))]

# Only make a plot if there are any relative systematics above 0.5
if len(sys_names_bin) > 0:
x = np.arange(len(sys_names_bin)) # the label locations
width = 0.5 # the width of the bars
fig = plt.figure(figsize=(12, 6))
ax = fig.add_subplot(1, 1, 1)
ax.set_ylabel("Relative Syst Size", fontsize=16)
ax.set_title(
f"{signal_name}: {channel_name} (Bin {bin_number}), Norm & Histo",
fontsize=20,
)
ax.bar(x - width / 2, rel_size, width)
ax.set_xticks(x)
ax.set_xticklabels(sys_names_bin, rotation=45, ha="right")
plt.tight_layout()
plt.savefig(
f"Plots/rel_systs_{signal_name}_{channel_name}_bin{bin_number}.png"
)
plt.close()


@click.command()
@click.option(
"--signal_template",
help="Signal name template, with signal masses as variables. Signal masses must be denoted by {a}, {b}, {c}, ... (eg. signal_{a}_{b}_more_info)",
)
@click.option(
"--x_var",
help="Which variable from the signal name template to plot on the x axis (defaults to 'a')",
default="a",
required=False,
)
@click.option(
"--y_var",
help="Which variable from the signal name template to plot on the y axis (defaults to 'b')",
default="b",
required=False,
)
@click.option(
"--v_max",
help="Maximum amplitude of the colourbar for plotting interpolated relative syst sizes",
default=10,
required=False,
)
@click.option(
"--x_label",
help="x label for interpolated plot of relative systematics",
default=None,
required=False,
)
@click.option(
"--y_label",
help="y label for interpolated plot of relative systematics",
default=None,
required=False,
)
def outlier_plot(signal_template, v_max, x_var, y_var, x_label, y_label):

patches = [json.load(open(x)) for x in glob.glob("patch*.json")]

data = {
x[0]["value"]["name"]: {
p["path"]: process_patch(p) for p in x if p["op"] == "add"
}
for x in patches
if "value" in x[0]
}

# Make the mapping of json channel names to analysis region names
listOfPatches = glob.glob("patch*.json")
spec_sig = json.load(open(listOfPatches[0]))
spec_bkg = json.load(open("BkgOnly.json"))

names_json = []
for ichan, channel in enumerate(spec_bkg["channels"]):
names_json.append(channel["name"])

channels_json = []
for ichan, channel in enumerate(spec_sig):
channels_json.append(channel["path"])

channel_names = dict(zip(channels_json, names_json))

# Make the mapping of number of json channel name to number of bins
workspace_bkg = pyhf.Workspace(json.load(open("BkgOnly.json")))
channel_bins = {}
for key, value in channel_names.items():
channel_bins[key] = workspace_bkg.channel_nbins[value]

outliers = []
for k, v in data.items():
for kk, vv in v.items():
for b, (r, n) in enumerate(zip(*vv)):
if r > 1.0:
outliers.append((k, kk, b, r, n))

# Make plots of relative syst for each signal point and bin
for x in patches:
if "value" in x[0]:
for p in x:
if p["op"] == "add":
plot_rel_systs(p, channel_names, channel_bins)

print("Outliers (> 1.0):")
for o in list(reversed(sorted(outliers, key=lambda x: x[-1]))):
print("\t", o[-1], o[-2], o[0], channel_names[o[1]], o[2])
"""
missing_signal = []
# missing signal in signal region
print("Missing signal in signal region:")
for k, v in data.items():
if not '/channels/2/samples/5' in v or not '/channels/2/samples/5' in v:
missing_signal.append(k)
print('\t',k)
"""

data = {
x[0]["value"]["name"]: {
p["path"]: process_patch(p)[0] for p in x if p["op"] == "add"
}
for x in patches
if "value" in x[0]
}

sig_name_template = signal_template

for ichan, channel in enumerate(channels_json):
rel_systs = np.asarray(
[
[
float(m)
for m in [
parse.parse(sig_name_template, k).named[x_var],
parse.parse(sig_name_template, k).named[y_var],
]
]
+ v.get(channel, np.array([0.0] * channel_bins[channel])).tolist()
for k, v in data.items()
]
)
x_min, x_max = min(rel_systs[:, 0]), max(rel_systs[:, 0])
y_min, y_max = min(rel_systs[:, 1]), max(rel_systs[:, 1])
x, y = np.mgrid[x_min:x_max:100j, y_min:y_max:100j]
for jbin in range(2, 2 + channel_bins[channel]):
f = plt.figure(figsize=(12, 6))
ax = f.add_subplot(1, 1, 1)
if x_label is not None:
ax.set_xlabel(x_label, fontsize=20)
if y_label is not None:
ax.set_ylabel(y_label, fontsize=20)
ax.set_xlim(x_min - 25, x_max + 25)
ax.set_ylim(y_min - 25, y_max + 25)
bin_number = jbin - 1
rel_systs = rel_systs[
rel_systs[:, jbin] != 0
] # Remove any points with zero relative syst

if len(rel_systs) > 0:
z = scipy.interpolate.griddata(
rel_systs[:, :2], rel_systs[:, jbin], (x, y)
)

vmin, vmax = 0, v_max
ax.scatter(
rel_systs[:, 0],
rel_systs[:, 1],
c=rel_systs[:, jbin],
edgecolors="w",
vmin=vmin,
vmax=vmax,
)
im = ax.contourf(x, y, z, levels=np.linspace(vmin, vmax, 100))
cb = plt.colorbar(im, ax=ax)
cb.set_label(label="$\oplus$ (histosys, normsys, staterr)", fontsize=18)
if channel_bins[channel] < 2:
ax.set_title(channel_names[channel], fontsize=20)
else:
ax.set_title(
f"{channel_names[channel]} (Bin {bin_number})", fontsize=20
)

outliers_chan = np.asarray(
[
[
float(parse.parse(sig_name_template, o[0]).named[x_var]),
float(parse.parse(sig_name_template, o[0]).named[y_var]),
]
+ [o[-2]]
for o in outliers
if o[1] == channel and o[2] == jbin - 2
]
)

if outliers_chan.shape[0]:
ax.scatter(
outliers_chan[:, 0],
outliers_chan[:, 1],
c=outliers_chan[:, 2],
vmin=0,
vmax=20,
cmap="cool",
)
for o in outliers_chan:
ax.text(o[0] + 5, o[1] + 5, "{:.2f}".format(o[2]), c="r")

plt.tight_layout()
plt.savefig(f"Plots/{channel_names[channel]}_bin{bin_number}.png")
plt.close()


if __name__ == "__main__":
outlier_plot()
1 change: 1 addition & 0 deletions setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@
"black",
"twine",
],
"test": ["matplotlib",],
}
extras_require["complete"] = sorted(set(sum(extras_require.values(), [])))

Expand Down
20 changes: 20 additions & 0 deletions tests/test_syst_validation.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
#!/bin/bash
# Script to test validate_systs.py with test input located in test_syst_validation_input. This script has been tested in the python:3.7 docker container with pyhf, parse, and matplotlib installed
# Note: this script is assumed to be run from one level above the directory containing the script.

cd tests || exit

# Download the published likelihoods from the sbottom publication
wget -O sbottom.tar.gz https://www.hepdata.net/record/resource/997020?view=true

# Grab the likelihoods from one of the analysis regions
tar -xvzf sbottom.tar.gz RegionA
rm sbottom.tar.gz

# Change the name of the dir to something more sensible for the validation output
mv RegionA test_syst_validation_dir
cd test_syst_validation_dir || exit
mkdir -p Plots

# Run the validate_systs.py script on the test sbottom likelihoods
python ../../scripts/validate_systs.py --signal_template "sbottom_{a}_{b}_{c}" --x_var a --y_var b --v_max 4 --x_label 'x label' --y_label 'y label'

0 comments on commit 97cc34c

Please sign in to comment.