Threat-Intelligence-Service/src/regenerate_distributions.py

"""
    Distributions (Re)generation Script

    This script generates likelihood and cost distributions based on threat
    intelligence data stored in a connected Neo4j graph database. It attempts to
    do so for every possible combination of (size, industry) values.

    These are then consumed by `montecarlo.py`, which runs a Monte Carlo
    simulation based on these figures.

    Acknowledgements: Dr Dan Prince & Dr Chris Sherlock
"""

import os
import sys
import argparse
import warnings
import logging as log

from typing import Tuple

import itertools
import numpy as np
import pandas as pd
import statsmodels.formula.api as smf
from matplotlib import pyplot as plt
from scipy.stats import lognorm

from graph import GraphInterface as gi

# Custom logging level used for "success" messages; equivalent to
# `logging.WARNING` + 1.
SUCCESS = 31

# The arbitrary maximum number of incidents that an organisation can experience
# in a year.
MAX_ANNUAL_INCIDENTS = 8000

# Maps the human-readable incident frequencies, which many sources (e.g., the
# CSBS 2020) use to present their results, to numeric annual incident counts.
# Each value is the lower bound of its band; "MAX" closes the final band.
#
# 'None' = 0
# 'Once per year' = 1
# 'Less than once a month' = 2–7
# 'Once a month' = 8–17
# 'Once a week' = 18–79
# 'Once a day' = 80–399
# 'Several times a day' = 400–8000
#
# NOTE: the insertion order matters. The frequency-distribution code slices
# `BOUNDARIES.values()` positionally (`[:-1]` and `[1:]`), so entries must stay
# in ascending order with "MAX" last.
BOUNDARIES = {
    "None": 0,
    "Once per year": 1,
    "Less than once a month": 2,
    "Once a month": 8,
    "Once a week": 18,
    "Once a day": 80,
    "Several times a day": 400,
    "MAX": MAX_ANNUAL_INCIDENTS,
}

# Run-time settings; both are assigned by `main()` from the command-line
# arguments and read by `_save_plot()`.
OUTPUT_DIR = None  # directory that plot images are written to
IMAGES = None  # whether plot images should be written at all

# pylint: disable=invalid-name,anomalous-backslash-in-string
def _generate_new_incident_frequency_distribution(pairing: Tuple = (None, None)) -> int:
    r"""
    Generates a new incident frequency distribution.

    Notes
    -----

    (Re)generates the incident frequency distribution for a
    :math:`\left(\text{size}, \text{industry}\right)` pairing from the data in
    a Neo4j graph database.

    The incident frequency probabilities returned by the graph interface are
    accumulated into a cumulative distribution :math:`F`, and a straight line
    is fitted (ordinary least squares) to :math:`\log(1 - F)` against the log
    of the upper boundary of each frequency band:

    .. math:: \log(1 - F(x)) = a \log b - a \log x

    The resulting :math:`a` and :math:`b` are stored in the graph database.
    Additional types of distribution can be implemented by overloading this
    method (by importing the `multipledispatch` package) and returning the
    values required for defining that distribution (e.g., :math:`\mu` and
    :math:`\sigma` instead of :math:`a` and :math:`b`).

    Parameters
    ----------
    pairing : Tuple
        The ``(size, industry)`` pairing to generate the distribution for.

    Returns
    -------
    int
        1 if a distribution was generated and stored, 0 otherwise (so that
        callers can simply sum the results).
    """
    # pylint: enable=anomalous-backslash-in-string

    log.info("Generating new incident frequency distribution for '%s'...", str(pairing))

    boundaries = list(BOUNDARIES.values())

    # Attempts to get the incident probabilities for the pairing from the graph
    # database
    incident_frequency_probabilities = gi.get_incident_frequency_probabilities(
        boundaries[:-1], pairing
    )
    if incident_frequency_probabilities is None:
        log.info(
            "No incident frequency distribution generated for '%s'.",
            str(pairing),
        )
        return 0

    log.debug(
        "Returned values are: incident frequency probabilities = %s",
        str(incident_frequency_probabilities),
    )

    # If values are found, generate a distribution
    cumulative_probabilities = np.cumsum(incident_frequency_probabilities)

    xs = np.log(boundaries[1:])
    # log(1 - F) is -inf (or NaN) wherever the cumulative probability reaches
    # (or, through rounding, exceeds) 1; those points are dropped below, so the
    # floating-point warnings are silenced here.
    with np.errstate(divide="ignore", invalid="ignore"):
        ys = np.log(1 - cumulative_probabilities)

    # Named columns, so that the formula below resolves `xs` and `ys` from the
    # data frame itself rather than from this function's local variables.
    data = pd.DataFrame({"xs": xs, "ys": ys})
    data = data[np.isfinite(data["ys"])]

    # A straight line needs at least two points to be fitted.
    if len(data) < 2:
        log.warning(
            "Too few usable data points to generate an incident frequency "
            "distribution for '%s'.",
            str(pairing),
        )
        return 0

    # pylint: disable=line-too-long
    # See <https://www.statsmodels.org/stable/_modules/statsmodels/stats/stattools.html#omni_normtest> for explanation
    # pylint: enable=line-too-long
    with warnings.catch_warnings():
        warnings.simplefilter("ignore")
        fit = smf.ols(formula="ys ~ xs", data=data).fit()
        log.debug(fit.summary())

    # Get the parameters for the generated distribution and store them in the
    # graph database. The parameters are looked up by label, as positional
    # integer indexing of a label-indexed Series is deprecated in pandas.
    alogb = fit.params["Intercept"]
    a = -fit.params["xs"]
    b = np.exp(alogb / a)

    gi.create_incident_frequency_distribution_node(pairing, a, b)

    log.log(
        SUCCESS,
        "New incident frequency distribution successfully generated for '%s'.",
        str(pairing),
    )
    return 1


# pylint: enable=invalid-name

# pylint: disable=anomalous-backslash-in-string
def _generate_new_incident_costs_distribution(pairing: Tuple = (None, None)) -> int:
    r"""
    (Re)generates the incident cost distribution for a
    :math:`\left(\text{size}, \text{industry}\right)` pairing from the data in
    a Neo4j graph database.

    Currently this only produces log-normal distributions. Additional types of
    distribution can be implemented by overloading this method (by importing the
    `multipledispatch` package) and returning the values required for defining
    that distribution (e.g., :math:`\mu` and :math:`\sigma` instead of :math:`a`
    and :math:`b`).

    Parameters
    ----------
    pairing : Tuple
        The ``(size, industry)`` pairing to generate the distribution for.

    Returns
    -------
    int
        1 if a distribution was generated and stored, 0 otherwise (so that
        callers can simply sum the results).
    """
    # pylint: enable=anomalous-backslash-in-string

    # Plots the distribution for the average cost of incident(s) over 12 months
    log.info("Generating new incident cost distribution for '%s'...", str(pairing))

    incident_mean_cost, incident_median_cost = gi.get_incident_cost_averages(pairing)
    if incident_mean_cost is None or incident_median_cost is None:
        log.info(
            "No incident costs distribution generated for '%s'.",
            str(pairing),
        )
        return 0

    log.debug(
        "Returned values are: mean = %s, median = %s",
        str(incident_mean_cost),
        str(incident_median_cost),
    )

    # For a log-normal distribution, mean = exp(mu + sigma^2 / 2) and
    # median = exp(mu), hence sigma^2 = 2 * (ln(mean) - ln(median)).
    #
    # The conditional only guards the log of the median (a median of 0 is
    # treated as contributing 0). It must be evaluated *before* the
    # subtraction; without the explicit grouping the conditional expression
    # swallows the whole subtraction and the mean is ignored whenever the
    # median is non-zero.
    with np.errstate(divide="ignore", invalid="ignore"):
        log_median = 0 if incident_median_cost == 0 else np.log(incident_median_cost)
        log_variance = 2 * (np.log(incident_mean_cost) - log_median)

    # A negative or non-finite variance (e.g., mean < median, or a
    # non-positive mean) cannot describe a log-normal distribution; skip the
    # pairing rather than storing NaN in the graph database.
    if not np.isfinite(log_variance) or log_variance < 0:
        log.warning(
            "Cannot derive a log-normal incident costs distribution for '%s' "
            "from mean = %s and median = %s.",
            str(pairing),
            str(incident_mean_cost),
            str(incident_median_cost),
        )
        return 0

    log_stddev = np.sqrt(log_variance)
    stddev = np.exp(log_stddev)

    _label_plot(
        "Average annual incident-with-outcome cost distribution", "Cost (£)", "Density"
    )
    # NOTE(review): the arguments passed to `lognorm.pdf` below (shape =
    # ln(mean), loc = ln(median), evaluated at ln(i)) do not correspond to the
    # sigma derived above. Left unchanged as it only affects the diagnostic
    # image; confirm the intended parameterisation.
    plt.plot(
        [
            lognorm.pdf(
                np.log(i),
                np.log(incident_mean_cost),
                np.log(incident_median_cost) if incident_median_cost > 0 else 0,
            )
            for i in range(1, 2500)
        ]
    )
    _save_plot("3 - cost dist")

    gi.create_incident_costs_distribution_node(pairing, incident_mean_cost, stddev)

    log.log(
        SUCCESS,
        "New incident costs distribution successfully generated for '%s'.",
        str(pairing),
    )
    return 1


def _generate_new_distributions(pairing: Tuple = (None, None)) -> Tuple:
    """
    (Re)generates the cost and likelihood distributions.

    Any existing distributions are deleted first. A frequency distribution and
    a costs distribution are then attempted for every combination of size and
    industry; a `None` in either position of `pairing` means "all values known
    to the graph database" for that position.

    Returns a `(frequency distributions generated, costs distributions
    generated)` tuple of counts.
    """

    # NOTE(review): invokes `__init__` directly on the imported
    # `GraphInterface` name; presumably this (re)initialises the database
    # connection -- confirm against `graph.py`.
    gi.__init__()

    deleted = bool(gi.delete_distributions())
    log.info("Existing distributions deleted: %s", deleted)

    # An unspecified size or industry expands to every value in the database.
    if pairing[0] is None:
        sizes = gi.get_sizes()
    else:
        sizes = [pairing[0]]

    if pairing[1] is None:
        industries = gi.get_industries()
    else:
        industries = [pairing[1]]

    frequency_count = 0
    costs_count = 0

    # Each generator returns 1 on success and 0 otherwise, so the running
    # totals are the number of distributions actually created.
    for combination in itertools.product(sizes, industries):
        frequency_count += _generate_new_incident_frequency_distribution(combination)
        costs_count += _generate_new_incident_costs_distribution(combination)

    return frequency_count, costs_count


def main():
    """
    Called when the script is run from the command-line.

    Parses the command-line arguments, configures logging and the module-level
    `OUTPUT_DIR` / `IMAGES` settings, makes sure the output directory exists,
    regenerates the distributions for the requested (size, industry) pairing
    and finally exits with status 0.
    """
    # pylint: disable=global-statement
    global OUTPUT_DIR, IMAGES
    # pylint: enable=global-statement

    parser = argparse.ArgumentParser()

    parser.add_argument(
        "-s",
        "--size",
        help="Specify the org. size (default: None)",
        choices=["micro", "small", "medium", "large"],
        type=str,
        default=None,
    )
    # `type` must be `str` here: argparse passes the raw argument *string* to
    # the type callable, and `chr()` only accepts integers, so `type=chr`
    # caused every value given to `--industry` to be rejected.
    parser.add_argument(
        "-i",
        "--industry",
        help="Specify the org. industry SIC code (top-level only, e.g. ‘C’ for "
        "Manufacturing’) (default: None)",
        choices=list(map(chr, range(65, 86))),
        type=str,
        default=None,
    )
    parser.add_argument(
        "-o",
        "--output",
        help="Specify the output directory (default: ./output/)",
        type=str,
        default=os.path.join(os.path.dirname(__file__), "output/"),
        metavar="DIRECTORY",
    )
    parser.add_argument(
        "-p",
        "--images",
        help="Output images at each step of the script (default: false, just "
        "output the final LEC image)",
        action="store_true",
        default=False,
    )
    parser.add_argument(
        "-v",
        "--verbose",
        help="Verbose console output (default: false)",
        action="store_true",
        default=False,
    )
    parser.add_argument(
        "-d",
        "--debug",
        help="Show debug console output (default: false)",
        action="store_true",
        default=False,
    )

    args = parser.parse_args()

    OUTPUT_DIR = args.output
    IMAGES = args.images

    size = args.size
    industry = args.industry

    # `--debug` takes precedence over `--verbose` when both are given.
    if args.debug:
        log.basicConfig(format="%(levelname)s: %(message)s", level=log.DEBUG)
        log.info("Debug output.")
    elif args.verbose:
        log.basicConfig(format="%(levelname)s: %(message)s", level=log.INFO)
        log.info("Verbose output.")
    else:
        log.basicConfig(format="%(levelname)s: %(message)s")

    # `exist_ok=True` avoids the check-then-create race of testing for the
    # directory first; it still raises if the path exists but is not a
    # directory.
    os.makedirs(OUTPUT_DIR, exist_ok=True)

    incidents_dists, costs_dists = _generate_new_distributions((size, industry))

    log.log(
        SUCCESS,
        "Successfully generated %s incident frequency distributions and %s "
        "incident costs distributions!",
        str(incidents_dists),
        str(costs_dists),
    )

    sys.exit(0)


def _label_plot(title="Untitled Plot", xlabel="x axis", ylabel="y axis") -> None:
    """Set the title and both axis labels of the current plot."""

    # The pyplot helpers operate on the current axes; address them directly.
    axes = plt.gca()
    axes.set_title(title)
    axes.set_xlabel(xlabel)
    axes.set_ylabel(ylabel)


def _save_plot(filename="untitled") -> None:
    """
    Save a plot and clear the figure.

    The image is written to `<OUTPUT_DIR>/<filename>.png`, but only when image
    output has been enabled (`IMAGES`); the current figure is cleared either
    way so that the next plot starts from a blank canvas.
    """

    if IMAGES:
        # Join the path components rather than concatenating strings, so that
        # an output directory given without a trailing separator (e.g.
        # `-o results`) does not end up fused with the file name.
        plt.savefig(os.path.join(OUTPUT_DIR, filename + ".png"))
    plt.clf()


if __name__ == "__main__":
    main()