feat: Add script to validate norm + histo systematics (#13)

* Add python script which allows the user to visualize the sizes of norm + histo systs relative to nominal * Reads in background-only json file and signal patches * Accepts command-line arguments to adapt the script for different analyses * Validation script also included to test with published sbottom likelihoods
pyhf · May 6, 2020 · 97cc34c · 97cc34c
1 parent b3e813e
commit 97cc34c
Show file tree

Hide file tree

Showing 3 changed files with 383 additions and 0 deletions.
diff --git a/scripts/validate_systs.py b/scripts/validate_systs.py
@@ -0,0 +1,362 @@
+"""
+  Date:     200325
+  History:  
+    - Originally written as a python notebook by Lukas Heinrich
+    - Adapted for the 3L-RJ likelihood validation by Giordon Stark (https://github.com/kratsg/3L-RJ-mimic-likelihood-validation/blob/master/OutlierPlot.ipynb)
+    - Adapted and generalized to script by Danika MacDonell [March 25, 2020]
+    
+  Details:  Script to visualize the relative size of the systematics for pyhf likelihoods at each mass point, where the systematics are added together in quadrature. Should be run in a directory containing the background-only json likelihood file, along with a patch json likelihood file for each signal point. Assumes that all the json patch file are located at the same directory level as this script.
+  
+  Usage:
+  >> python OutlierPlot.py --signal_template <signal_template_{a}_{b}_{c}_for_masses> --x_var <which variable in signal name template to plot on x axis (defaults to 'a')> --y_var <which variable in signal name template to plot on x axis (defaults to 'b')> --v_max <max colourbar amplitude> --x_label <x axis label> --y_label <y axis label>
+  
+  Example for 1Lbb Wh analysis (https://glance.cern.ch/atlas/analysis/analyses/details.php?id=2969):
+  
+  >> python OutlierPlot.py --signal_template C1N2_Wh_hbb_{a}_{b} --x_var a --y_var b --v_max 10 --x_label '$m(\tilde{\chi}_{1}^{\pm}/\tilde{\chi}_{2}^{0})$ [GeV]' --y_label '$m(\tilde{\chi}_{1}^{0})$ [GeV]'
+  
+  The signal template is the name of an arbitrary signal in the json patch files, with the signal masses left as {}. Given a background-only file and a patch file, the signal name can be found under "samples" in the output of:
+  >> jsonpatch BkgOnly.json patch_XXX.json | pyhf inspect
+  
+  If, for example, one of the signals is called C1N2_Wh_hbb_550_200, where 550 and 200 are the variable model masses, the signal template would be C1N2_Wh_hbb_{}_{}.
+  """
+
+import json
+import glob
+import numpy as np
+import matplotlib.pyplot as plt
+import scipy.interpolate
+import parse
+import click
+import pyhf
+
+plt.rc("xtick", labelsize=14)
+plt.rc("ytick", labelsize=14)
+
+
+def handle_deltas(delta_up, delta_dn):
+    nom_is_center = np.bitwise_or(
+        np.bitwise_and(delta_up > 0, delta_dn > 0),
+        np.bitwise_and(delta_up <= 0, delta_dn <= 0),
+    )
+    span = delta_dn + delta_up
+    maxdel = np.maximum(np.abs(delta_dn), np.abs(delta_dn))
+    abs_unc = np.where(nom_is_center, span, maxdel)
+    return abs_unc
+
+
+def process_patch(p):
+    nom = np.asarray(p["value"]["data"])
+
+    # histosys
+    hid = np.asarray(
+        [
+            m["data"]["hi_data"]
+            for m in p["value"]["modifiers"]
+            if m["type"] == "histosys"
+        ]
+    )
+    lod = np.asarray(
+        [
+            m["data"]["lo_data"]
+            for m in p["value"]["modifiers"]
+            if m["type"] == "histosys"
+        ]
+    )
+    delta_up = hid - nom
+    delta_dn = nom - lod
+    histo_deltas = handle_deltas(delta_up, delta_dn)
+
+    hi = np.asarray(
+        [m["data"]["hi"] for m in p["value"]["modifiers"] if m["type"] == "normsys"]
+    )
+    lo = np.asarray(
+        [m["data"]["lo"] for m in p["value"]["modifiers"] if m["type"] == "normsys"]
+    )
+    delta_up = np.asarray([h * nom - nom for h in hi])
+    delta_dn = np.asarray([l * nom - nom for l in lo])
+    norm_deltas = handle_deltas(delta_up, delta_dn)
+
+    stat_deltas = np.zeros_like(
+        histo_deltas
+    )  # Don't consider the stat error for this calculation
+
+    systs = np.concatenate([histo_deltas, norm_deltas, stat_deltas])
+    inquad = np.sqrt(np.sum(np.square(systs), axis=0))
+    rel = inquad / nom
+    rel = np.where(nom == 0, np.ones_like(nom), rel)
+    return rel, nom
+
+
+def plot_rel_systs(p, channel_names, channel_bins):
+
+    signal_name = p["value"]["name"]
+    channel_name = channel_names[p["path"]]
+    nom = np.asarray(p["value"]["data"])
+    n_bins = channel_bins[p["path"]]
+
+    # Collect syst names and deltas for histo systs
+    sys_names_histo = [
+        np.asarray(
+            [m["name"] for m in p["value"]["modifiers"] if m["type"] == "histosys"]
+        )
+    ][0].tolist()
+    hid = np.asarray(
+        [
+            m["data"]["hi_data"]
+            for m in p["value"]["modifiers"]
+            if m["type"] == "histosys"
+        ]
+    )
+    lod = np.asarray(
+        [
+            m["data"]["lo_data"]
+            for m in p["value"]["modifiers"]
+            if m["type"] == "histosys"
+        ]
+    )
+    delta_up = hid - nom
+    delta_dn = nom - lod
+    histo_deltas = handle_deltas(delta_up, delta_dn)
+    histo_rel_size = np.absolute(histo_deltas) / (1.0 * nom)
+
+    # Collect syst names and deltas for norm systs
+    sys_names_norm = [
+        np.asarray(
+            [m["name"] for m in p["value"]["modifiers"] if m["type"] == "normsys"]
+        )
+    ][0].tolist()
+    hi = np.asarray(
+        [m["data"]["hi"] for m in p["value"]["modifiers"] if m["type"] == "normsys"]
+    )
+    lo = np.asarray(
+        [m["data"]["lo"] for m in p["value"]["modifiers"] if m["type"] == "normsys"]
+    )
+    delta_up = np.asarray([h * nom - nom for h in hi])
+    delta_dn = np.asarray([l * nom - nom for l in lo])
+    norm_deltas = handle_deltas(delta_up, delta_dn)
+    norm_rel_size = np.absolute(norm_deltas) / (1.0 * nom)
+
+    sys_names = sys_names_histo + sys_names_norm
+
+    for iBin in range(n_bins):
+        bin_number = iBin + 1
+        rel_size = np.concatenate((histo_rel_size[:, iBin], norm_rel_size[:, iBin]))
+        sys_names_bin = [
+            sys_names[idx]
+            for idx, size in enumerate(rel_size)
+            if size > 0.5 and np.isfinite(size)
+        ]  # Select for relative systematics above 0.5 (i.e. >50% variation from nominal)
+        rel_size = rel_size[(rel_size > 0.5) & (np.isfinite(rel_size))]
+
+        # Only make a plot if there are any relative systematics above 0.5
+        if len(sys_names_bin) > 0:
+            x = np.arange(len(sys_names_bin))  # the label locations
+            width = 0.5  # the width of the bars
+            fig = plt.figure(figsize=(12, 6))
+            ax = fig.add_subplot(1, 1, 1)
+            ax.set_ylabel("Relative Syst Size", fontsize=16)
+            ax.set_title(
+                f"{signal_name}: {channel_name} (Bin {bin_number}), Norm & Histo",
+                fontsize=20,
+            )
+            ax.bar(x - width / 2, rel_size, width)
+            ax.set_xticks(x)
+            ax.set_xticklabels(sys_names_bin, rotation=45, ha="right")
+            plt.tight_layout()
+            plt.savefig(
+                f"Plots/rel_systs_{signal_name}_{channel_name}_bin{bin_number}.png"
+            )
+            plt.close()
+
+
+@click.command()
+@click.option(
+    "--signal_template",
+    help="Signal name template, with signal masses as variables. Signal masses must be denoted by {a}, {b}, {c}, ... (eg. signal_{a}_{b}_more_info)",
+)
+@click.option(
+    "--x_var",
+    help="Which variable from the signal name template to plot on the x axis (defaults to 'a')",
+    default="a",
+    required=False,
+)
+@click.option(
+    "--y_var",
+    help="Which variable from the signal name template to plot on the y axis (defaults to 'b')",
+    default="b",
+    required=False,
+)
+@click.option(
+    "--v_max",
+    help="Maximum amplitude of the colourbar for plotting interpolated relative syst sizes",
+    default=10,
+    required=False,
+)
+@click.option(
+    "--x_label",
+    help="x label for interpolated plot of relative systematics",
+    default=None,
+    required=False,
+)
+@click.option(
+    "--y_label",
+    help="y label for interpolated plot of relative systematics",
+    default=None,
+    required=False,
+)
+def outlier_plot(signal_template, v_max, x_var, y_var, x_label, y_label):
+
+    patches = [json.load(open(x)) for x in glob.glob("patch*.json")]
+
+    data = {
+        x[0]["value"]["name"]: {
+            p["path"]: process_patch(p) for p in x if p["op"] == "add"
+        }
+        for x in patches
+        if "value" in x[0]
+    }
+
+    # Make the mapping of json channel names to analysis region names
+    listOfPatches = glob.glob("patch*.json")
+    spec_sig = json.load(open(listOfPatches[0]))
+    spec_bkg = json.load(open("BkgOnly.json"))
+
+    names_json = []
+    for ichan, channel in enumerate(spec_bkg["channels"]):
+        names_json.append(channel["name"])
+
+    channels_json = []
+    for ichan, channel in enumerate(spec_sig):
+        channels_json.append(channel["path"])
+
+    channel_names = dict(zip(channels_json, names_json))
+
+    # Make the mapping of number of json channel name to number of bins
+    workspace_bkg = pyhf.Workspace(json.load(open("BkgOnly.json")))
+    channel_bins = {}
+    for key, value in channel_names.items():
+        channel_bins[key] = workspace_bkg.channel_nbins[value]
+
+    outliers = []
+    for k, v in data.items():
+        for kk, vv in v.items():
+            for b, (r, n) in enumerate(zip(*vv)):
+                if r > 1.0:
+                    outliers.append((k, kk, b, r, n))
+
+    # Make plots of relative syst for each signal point and bin
+    for x in patches:
+        if "value" in x[0]:
+            for p in x:
+                if p["op"] == "add":
+                    plot_rel_systs(p, channel_names, channel_bins)
+
+    print("Outliers (> 1.0):")
+    for o in list(reversed(sorted(outliers, key=lambda x: x[-1]))):
+        print("\t", o[-1], o[-2], o[0], channel_names[o[1]], o[2])
+    """
+    missing_signal = []
+    # missing signal in signal region
+    print("Missing signal in signal region:")
+    for k, v in data.items():
+        if not '/channels/2/samples/5' in v or not '/channels/2/samples/5' in v:
+            missing_signal.append(k)
+            print('\t',k)
+    """
+
+    data = {
+        x[0]["value"]["name"]: {
+            p["path"]: process_patch(p)[0] for p in x if p["op"] == "add"
+        }
+        for x in patches
+        if "value" in x[0]
+    }
+
+    sig_name_template = signal_template
+
+    for ichan, channel in enumerate(channels_json):
+        rel_systs = np.asarray(
+            [
+                [
+                    float(m)
+                    for m in [
+                        parse.parse(sig_name_template, k).named[x_var],
+                        parse.parse(sig_name_template, k).named[y_var],
+                    ]
+                ]
+                + v.get(channel, np.array([0.0] * channel_bins[channel])).tolist()
+                for k, v in data.items()
+            ]
+        )
+        x_min, x_max = min(rel_systs[:, 0]), max(rel_systs[:, 0])
+        y_min, y_max = min(rel_systs[:, 1]), max(rel_systs[:, 1])
+        x, y = np.mgrid[x_min:x_max:100j, y_min:y_max:100j]
+        for jbin in range(2, 2 + channel_bins[channel]):
+            f = plt.figure(figsize=(12, 6))
+            ax = f.add_subplot(1, 1, 1)
+            if x_label is not None:
+                ax.set_xlabel(x_label, fontsize=20)
+            if y_label is not None:
+                ax.set_ylabel(y_label, fontsize=20)
+            ax.set_xlim(x_min - 25, x_max + 25)
+            ax.set_ylim(y_min - 25, y_max + 25)
+            bin_number = jbin - 1
+            rel_systs = rel_systs[
+                rel_systs[:, jbin] != 0
+            ]  # Remove any points with zero relative syst
+
+            if len(rel_systs) > 0:
+                z = scipy.interpolate.griddata(
+                    rel_systs[:, :2], rel_systs[:, jbin], (x, y)
+                )
+
+                vmin, vmax = 0, v_max
+                ax.scatter(
+                    rel_systs[:, 0],
+                    rel_systs[:, 1],
+                    c=rel_systs[:, jbin],
+                    edgecolors="w",
+                    vmin=vmin,
+                    vmax=vmax,
+                )
+                im = ax.contourf(x, y, z, levels=np.linspace(vmin, vmax, 100))
+                cb = plt.colorbar(im, ax=ax)
+                cb.set_label(label="$\oplus$ (histosys, normsys, staterr)", fontsize=18)
+                if channel_bins[channel] < 2:
+                    ax.set_title(channel_names[channel], fontsize=20)
+                else:
+                    ax.set_title(
+                        f"{channel_names[channel]} (Bin {bin_number})", fontsize=20
+                    )
+
+                outliers_chan = np.asarray(
+                    [
+                        [
+                            float(parse.parse(sig_name_template, o[0]).named[x_var]),
+                            float(parse.parse(sig_name_template, o[0]).named[y_var]),
+                        ]
+                        + [o[-2]]
+                        for o in outliers
+                        if o[1] == channel and o[2] == jbin - 2
+                    ]
+                )
+
+                if outliers_chan.shape[0]:
+                    ax.scatter(
+                        outliers_chan[:, 0],
+                        outliers_chan[:, 1],
+                        c=outliers_chan[:, 2],
+                        vmin=0,
+                        vmax=20,
+                        cmap="cool",
+                    )
+                for o in outliers_chan:
+                    ax.text(o[0] + 5, o[1] + 5, "{:.2f}".format(o[2]), c="r")
+
+                plt.tight_layout()
+                plt.savefig(f"Plots/{channel_names[channel]}_bin{bin_number}.png")
+                plt.close()
+
+
+if __name__ == "__main__":
+    outlier_plot()
diff --git a/setup.py b/setup.py
@@ -17,6 +17,7 @@
         "black",
         "twine",
     ],
+    "test": ["matplotlib",],
 }
 extras_require["complete"] = sorted(set(sum(extras_require.values(), [])))
 

diff --git a/tests/test_syst_validation.sh b/tests/test_syst_validation.sh
@@ -0,0 +1,20 @@
+#!/bin/bash
+# Script to test validate_systs.py with test input located in test_syst_validation_input. This script has been tested in the python:3.7 docker container with pyhf, parse, and matplotlib installed
+# Note: this script is assumed to be run from one level above the directory containing the script.
+
+cd tests || exit
+
+# Download the published likelihoods from the sbottom publication
+wget -O sbottom.tar.gz https://www.hepdata.net/record/resource/997020?view=true
+
+# Grab the likelihoods from one of the analysis regions
+tar -xvzf sbottom.tar.gz RegionA
+rm sbottom.tar.gz
+
+# Change the name of the dir to something more sensible for the validation output
+mv RegionA test_syst_validation_dir
+cd test_syst_validation_dir || exit
+mkdir -p Plots
+
+# Run the validate_systs.py script on the test sbottom likelihoods
+python ../../scripts/validate_systs.py --signal_template "sbottom_{a}_{b}_{c}" --x_var a --y_var b --v_max 4 --x_label 'x label' --y_label 'y label'