Threat-Intelligence-Service/src/montecarlo.py

"""
    Monte Carlo Simulation Script

    This script runs a Monte Carlo simulation for an organisation of a given
    size and industry, utilising the most relevant available available.

    Acknowledgements: Dr Dan Prince & Dr Chris Sherlock
"""

import os
import sys
import argparse
import pickle
import logging as log

from typing import Tuple, Dict, Union

import random
import math
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt

from graph import GraphInterface as gi

# Used for logging, equivalent to `logging.INFO`.
SUCCESS = 20

# If not specified, the default number of Monte Carlo simulation runs to perform.
DEFAULT_RUNS = 5000

# The arbitrary maximum number of incidents that an organisation can experience
# in a year.
MAX_ANNUAL_INCIDENTS = 8000

# The maximum value of a company; any yearly losses over result in a bankruptcy
COMPANY_VALUE = 100000

# A smaller value increases the curviness of the loss exeedence curve.
# Less than 30 starts to get a bit steppy though.
LEC_PRECISION = math.floor(COMPANY_VALUE / 30)

# Quantifies the quantitative boundaries for human-readable incident frequencies,
# which many sources (e.g., the CSBS 2020) use to present their results.
#
# 'None' = 0
# 'Annually' = 1
# 'Less than monthly' = 2–7
# 'Monthly' = 8–17
# 'Weekly' = 18–79
# 'Daily' = 80–399
# 'More than daily' = 400–8000
BOUNDARIES = {
    "None": 0,
    "Once per year": 1,
    "Less than once a month": 2,
    "Once a month": 8,
    "Once a week": 18,
    "Once a day": 80,
    "Several times a day": 400,
    "MAX": MAX_ANNUAL_INCIDENTS,
}

N = None
OUTPUT_DIR = None
IMAGES = None
FORCE = None


def _calculate_num_of_incidents(incidents_dist: Dict[float, float]) -> float:
    """Calculate how many incidents have occurred in a given year."""

    log.debug("Incident distribution: %s", str(incidents_dist))

    num_of_incidents = incidents_dist["b"] / (1 - np.random.uniform()) ** (
        1 / incidents_dist["a"]
    )
    log.debug("Number of incidents (as `int`): %s", str(int(num_of_incidents)))

    return (
        int(num_of_incidents)
        if num_of_incidents <= MAX_ANNUAL_INCIDENTS
        else MAX_ANNUAL_INCIDENTS
    )


def _calculate_sum_cost_of_incidents(
    num_of_incidents: int, costs_dist: Dict[float, float], idx: int = None
) -> float:
    """For a list of incident numbers, calculate how much each breach cost and
    return the sum."""

    log.debug("Costs distribution: %s", str(costs_dist))

    if (N < 1000) or (N >= 1000 and idx % math.floor(N / 100) == 0):
        log.info(
            "Running Monte Carlo simulation... (%s/%s iterations)", str(idx), str(N)
        )

    if num_of_incidents == 0:
        return 0

    loc = np.log(
        costs_dist["mean"] ** 2
        / np.sqrt(costs_dist["stddev"] ** 2 + costs_dist["mean"] ** 2)
    )
    shape = np.sqrt(np.log(1 + (costs_dist["stddev"] ** 2 / costs_dist["mean"] ** 2)))

    costs = [random.lognormvariate(loc, shape) for r in range(num_of_incidents)]

    return sum(costs)


# pylint: disable=invalid-name
def _get_most_relevant_incident_frequency_distribution(
    pairing: Tuple = ("All", "All")
) -> Union[Dict[float, float], None]:
    """Gets the distribution for incident frequency from the data in the Neo4j
    graph database."""

    log.info(
        "Finding most relevant incident frequency distribution for %s...", str(pairing)
    )
    return gi.get_incident_frequency_distribution(pairing)


# pylint: enable=invalid-name


def _get_most_relevant_incident_costs_distribution(
    pairing: Tuple = ("All", "All")
) -> Union[Dict[float, float], None]:
    """Gets the distribution for incident costs from the data in the Neo4j
    graph database."""

    log.info(
        "Finding most relevant incident costs distribution for %s...", str(pairing)
    )
    return gi.get_incident_costs_distribution(pairing)


def _get_most_relevant_distributions(
    pairing: Tuple = ("All", "All")
) -> Dict[Union[Dict[float, float], None], Union[Dict[float, float], None]]:
    """Generate (or retrieve) a population of annual incident quantities and a
    distribution of incident-with-outcome cost values."""

    # -- caching --
    # Retrieves previously-calculated values if possible
    if not FORCE and OUTPUT_DIR is not None:
        try:
            filename = "{}-{}.pickle".format(pairing[0], pairing[1])
            dists = pickle.load(open(OUTPUT_DIR + filename, "rb"))

            log.info("Previously-calculated distributions found")
            return dists["incidents"], dists["costs"]
        except (OSError, IOError):
            log.info("Previously-calculated distributions not found")

    # Otherwise, generates fresh ones
    gi.__init__()

    incidents_dist = _get_most_relevant_incident_frequency_distribution(pairing)
    costs_dist = _get_most_relevant_incident_costs_distribution(pairing)

    log.debug(
        "Returned values are: incidents_dist = %s, costs_dist = %s",
        str(incidents_dist),
        str(costs_dist),
    )

    # Saves the figures for faster analysis in future
    if OUTPUT_DIR is not None and incidents_dist is not None and costs_dist is not None:
        dists = {
            "incidents": incidents_dist,
            "costs": costs_dist,
        }
        filename = "{}-{}.pickle".format(pairing[0], pairing[1])
        pickle.dump(dists, open(OUTPUT_DIR + filename, "wb"))

    return incidents_dist, costs_dist


# pylint: disable=anomalous-backslash-in-string
def _run_monte_carlo_simulation(pairing: Tuple = ("All", "All")) -> None:
    """
    Runs :math:`n` simulations of a 12-month  period, calculating the number
    of incidents encountered each time and their cumulative costs.
    """
    # pylint: enable=anomalous-backslash-in-string

    # Generates both distributions
    incidents_dist, costs_dist = _get_most_relevant_distributions(pairing)

    if incidents_dist is None and costs_dist is None:
        return incidents_dist, costs_dist

    # Calculates the number of incidents suffered over $n$ simulated years
    nums_of_incidents = np.array(
        [_calculate_num_of_incidents(incidents_dist) for i in range(N)]
    )
    log.debug("Number of incidents: %s", str(nums_of_incidents))

    _label_plot(
        "Histogram of Incident Frequencies (over 12 months)",
        "Number of Incidents ($log_{10}$)",
        "Frequency",
    )
    plt.hist(
        [np.log10(i) if i > 0 else 0 for i in nums_of_incidents],
        align="left",
        bins=range(12),
    )
    _save_plot("2 - histogram of incident frequencies")

    # Calculates the annual costs for each simulated year
    log.info("Running Monte Carlo simulation... (0/%s iterations)", str(N))
    sum_costs = [
        _calculate_sum_cost_of_incidents(num_of_incidents, costs_dist, idx)
        for idx, num_of_incidents in enumerate(nums_of_incidents, start=1)
    ]
    log.info("Running Monte Carlo simulation... (%s/%s iterations)", str(N), str(N))

    _label_plot(
        "Histogram of Sum Costs (over 12 months)", "Total Cost (£)", "Frequency"
    )
    plt.ticklabel_format(style="plain")
    plt.hist(sum_costs, align="left", bins=15, range=(0, COMPANY_VALUE))
    _save_plot("4 - histogram of sum costs")

    _label_plot("Density of Sum Costs (over 12 months)", "Total Cost (£)", "Density")
    pd.Series(sum_costs).plot(kind="density")
    plt.xlim(0, COMPANY_VALUE * 2)
    plt.ticklabel_format(style="plain")
    _save_plot("5 - density of sum costs")

    # Get loss exceedance curve
    log.info("Generating loss exceedance curve")

    hist, edges = np.histogram(sum_costs, bins=LEC_PRECISION)
    cumrev = np.cumsum(hist[::-1])[::-1] * 100 / len(sum_costs)

    _label_plot(
        "Loss Exceedance Curve (Monte Carlo sim)",
        "Loss (£, 99th percentile)",
        "Chance of Loss or Greater (%)",
    )
    plt.ticklabel_format(style="plain")
    plt.xlim(0, COMPANY_VALUE)
    plt.plot(edges[:-1], cumrev)
    _save_plot("6 - lec" if IMAGES else "lec")

    log.info("Simulation complete!")

    return nums_of_incidents, sum_costs


def main():
    """Called when the script is run from the command-line"""
    # pylint: disable=global-statement
    global N, OUTPUT_DIR, IMAGES, FORCE
    # pylint: enable=global-statement

    parser = argparse.ArgumentParser()

    parser.add_argument(
        "-n",
        "--number",
        help="The number of simulations to run (default: " + str(DEFAULT_RUNS) + ")",
        type=int,
        default=DEFAULT_RUNS,
    )
    parser.add_argument(
        "-s",
        "--size",
        help="The size of the organisation to simulate (default: all)",
        type=str,
        default="All",
    )
    parser.add_argument(
        "-i",
        "--industry",
        help="The industry of the organisation to simulate (default: all)",
        type=str,
        default="All",
    )
    parser.add_argument(
        "-o",
        "--output",
        help="Specify the output directory (default: ./output/)",
        type=str,
        default=os.path.join(os.path.dirname(__file__), "output/"),
        metavar="DIRECTORY",
    )
    parser.add_argument(
        "-p",
        "--images",
        help="Output images at each step of the script (default: false, just \
            output the final LEC image)",
        action="store_true",
        default=False,
    )
    parser.add_argument(
        "-f",
        "--force",
        help="Force re-generation of incident and cost distributions (default: false)",
        action="store_true",
        default=False,
    )
    parser.add_argument(
        "-v",
        "--verbose",
        help="Verbose console output (default: false)",
        action="store_true",
        default=False,
    )
    parser.add_argument(
        "-d",
        "--debug",
        help="Show debug console output (default: false)",
        action="store_true",
        default=False,
    )

    args = parser.parse_args()

    N = args.number
    OUTPUT_DIR = args.output
    IMAGES = args.images
    FORCE = args.force

    size = args.size
    industry = args.industry

    if args.debug:
        log.basicConfig(format="%(levelname)s: %(message)s", level=log.DEBUG)
        log.info("Debug output.")
    elif args.verbose:
        log.basicConfig(format="%(levelname)s: %(message)s", level=log.INFO)
        log.info("Verbose output.")
    else:
        log.basicConfig(format="%(levelname)s: %(message)s")

    if not os.path.isdir(OUTPUT_DIR):
        os.makedirs(OUTPUT_DIR)

    if size or industry:
        print("Running simulation for ({}, {})".format(size, industry))
        nums_of_incidents, sum_costs = _run_monte_carlo_simulation((size, industry))
        if nums_of_incidents is not None and sum_costs is not None:
            log.info(
                "Results:\nNumbers of incidents: %s\nSum costs: %s\n",
                str(nums_of_incidents),
                str(sum_costs),
            )

            avg_num_of_incidents = int(sum(nums_of_incidents) / len(nums_of_incidents))
            avg_sum_costs = sum(sum_costs) / len(sum_costs)
            log.log(
                SUCCESS,
                "Results:\nAverage number of incidents: %d\nAverage cost: £%.2f",
                avg_num_of_incidents,
                avg_sum_costs,
            )

            # Print output that will be picked up by game server.
            # pylint: disable=fixme
            # TODO: For some reason the results at the moment are orders of magnitude
            # too high, so for now I've plugged it by dividing both results by 100.
            # pylint: enable=fixme
            print(int(avg_num_of_incidents / 100))
            print("%.2f" % (avg_sum_costs / 100))
        else:
            log.warning("No data found.")
            print("No data found.")

    print("Running simulation for (All, All)")
    gen_nums_of_incidents, gen_sum_costs = _run_monte_carlo_simulation()
    log.info(
        "Results:\nNumbers of incidents: %s\nSum costs: %s\n",
        str(gen_nums_of_incidents),
        str(gen_sum_costs),
    )

    avg_gen_num_of_incidents = int(
        sum(gen_nums_of_incidents) / len(gen_nums_of_incidents)
    )
    avg_gen_sum_costs = sum(gen_sum_costs) / len(gen_sum_costs)
    log.log(
        SUCCESS,
        "Results:\nAverage number of incidents: %d\nAverage cost: £%.2f",
        avg_gen_num_of_incidents,
        avg_gen_sum_costs,
    )

    # Print output that will be picked up by the game server.
    print(int(avg_gen_num_of_incidents / 100))
    print("%.2f" % (avg_gen_sum_costs / 100))

    sys.exit(0)


def _label_plot(title="Untitled Plot", xlabel="x axis", ylabel="y axis") -> None:
    """Apply titles and axis labels to a plot."""

    plt.title(title)
    plt.xlabel(xlabel)
    plt.ylabel(ylabel)


def _save_plot(filename="untitled") -> None:
    """Save a plot and clear the figure."""

    if IMAGES:
        plt.savefig(OUTPUT_DIR + filename + ".png")
    plt.clf()


if __name__ == "__main__":
    main()