initial commit

This commit is contained in:
Ben Goldsworthy 2021-04-02 11:10:12 +01:00
commit 51deec9859
14 changed files with 3059 additions and 0 deletions

696
src/graph.py Normal file

@@ -0,0 +1,696 @@
"""
Neo4j Graph Database Interface
This module defines:
a) the interface for interacting with the Neo4j graph database; and
b) subclasses of `Relationship`.
"""
import re
import sys
import logging as log
from typing import List, Tuple, Union, Dict
from datetime import datetime
from py2neo import Graph, Node, NodeMatcher, Relationship, DatabaseError
import numpy as np
class GraphInterface:
"""
An interface for the Neo4j graph database used to hold threat intelligence (TI) data.
This interface abstracts out the actual transactions, allowing a user
to use more friendly methods without worrying about the implementation or
learning the Cypher query language syntax.
This class should:
a) determine the correct transactions to use based on the called
method and any arguments;
b) return only `Node`s, `Relationship`s, `SubGraph`s or lists thereof,
so that the values can be assigned to subclasses of those at the
point of calling; and
c) deal with any `Exception`s, but not issues like returning 0 results,
which should be dealt with at the point of calling.
"""
g: Graph = None
@staticmethod
def __init__():
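# NB: called explicitly (e.g. `GraphInterface.__init__()`) by the consuming
# scripts to open the shared, class-level connection; the class itself is
# never instantiated.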
try:
if GraphInterface.g is None:
GraphInterface.g = Graph(password="test")
log.info("Neo4j database connection opened successfully.")
else:
log.warning(
"Neo4j database already connected - this branch "
"shouldn't have been hit though!"
)
except DatabaseError:
log.error("ERR: Neo4j database connection not successfully opened.")
sys.exit()
@staticmethod
def delete_distributions() -> bool:
"""Deletes any pre-existing distributions."""
GraphInterface.g.run(
"MATCH (n) "
"WHERE n:IncidentFrequencyDistribution OR n:IncidentCostsDistribution "
"DETACH DELETE n;"
)
return True
@staticmethod
def get_incident_frequency_probabilities(
boundaries, pairing: Tuple = ("All", "All")
) -> List[float]:
"""
Attempts to get a list of probabilities for different annual incident
frequencies, specific to the organisational details provided.
It first gets (the average of) any sets of base frequencies, then looks
up the provided size/industry values to see if they have any assigned
breach probability values in the graph database. If multiple values are
found, the average is taken.
Once the specific base (i.e., >0) probability is found, it then recalculates
the overall set of probabilities as proportions of that base figure.
"""
size = pairing[0]
industry = pairing[1]
log.info(
"Attempting to get breach frequency probabilities specific to ('%s', '%s')...",
size,
industry,
)
base_frequency_probabilities_nodes = GraphInterface._get_nodes(
"IncidentBaseFrequencyProbabilities"
)
base_frequency_probabilities = [
node["probabilities"]
for node in base_frequency_probabilities_nodes
if len(node["probabilities"]) == (len(boundaries) - 1)
]
# If there are >1 sets of likelihoods, gets the mean for each boundary value.
if len(base_frequency_probabilities) > 1:
log.info("Multiple sets of base frequencies found, averaging...")
base_frequency_probabilities = np.array(base_frequency_probabilities)
# Averages column-wise (each row has `len(boundaries) - 1` entries) and
# keeps the result nested so that the `[0]` indexing below still works.
base_frequency_probabilities = [
[
np.mean(base_frequency_probabilities[:, i])
for i in range(len(boundaries) - 1)
]
]
probability_of_breach = GraphInterface.get_probability_of_breach(size, industry)
if probability_of_breach:
log.info(
"Found specific >0 breaches probability value for one or both "
"of ('%s', '%s'), calculating follow-on values...",
size,
industry,
)
# Sets the probability of having 0 breaches.
breach_frequency_probabilities = [(100 - probability_of_breach) / 100]
# Calculates the remaining probabilities proportional to the sum
# >0 breaches probability.
for base_frequency_probability in base_frequency_probabilities[0]:
breach_frequency_probabilities.append(
(probability_of_breach * base_frequency_probability) / 100
)
if len(breach_frequency_probabilities) != len(boundaries):
raise Exception("Mismatched boundaries!")
return breach_frequency_probabilities
log.info("No breach probability value found.")
return None
# pylint: disable=too-many-branches,too-many-locals,too-many-statements
@staticmethod
def get_probability_of_breach(size="All", industry="All") -> float:
"""
Returns the probability of an organisation of a given size and/or
industry experiencing a breach with an outcome in the next year.
Where probability values are found for both size and industry, the two are
averaged. It is assumed that organisations of a similar size will have a
more similar threat model than organisations within the same industry;
this assumption is not empirically grounded, however, so it may be that
the opposite is true.
"""
size_probability = None
industry_probability = None
size_node = GraphInterface._get_node("Size", name=size)
if size_node:
log.info("Found node for size '%s'.", size)
else:
log.info("No node found for size '%s'.", size)
industry_node = GraphInterface._get_node("Industry", name=industry)
if industry_node:
log.info("Found node for industry '%s'.", industry)
else:
log.info("No node found for industry '%s'.", industry)
# If no figures were found for this pairing, returns None.
if size_node is None and industry_node is None:
return None
if size_node:
size_relations = GraphInterface.g.match({size_node}, r_type=FOR_SIZE)
size_probabilities = []
for rel in size_relations:
if rel.start_node.has_label("IncidentProbability"):
size_probabilities.append(rel.start_node["probability"])
if len(size_probabilities) > 1:
log.info(
"Multiple probabilities found for size '%s', averaging...", size
)
size_probability = sum(size_probabilities) / len(size_probabilities)
elif len(size_probabilities) == 1:
log.info("Probability value found for size '%s'.", size)
size_probability = size_probabilities[0]
else:
log.info("No probability value found for size '%s'.", size)
if industry_node:
industry_relations = GraphInterface.g.match(
{industry_node}, r_type=FOR_INDUSTRY
)
industry_probabilities = []
for rel in industry_relations:
if rel.start_node.has_label("IncidentProbability"):
industry_probabilities.append(rel.start_node["probability"])
if len(industry_probabilities) > 1:
log.info(
"Multiple probabilities found for industry '%s', averaging...",
industry,
)
industry_probability = sum(industry_probabilities) / len(
industry_probabilities
)
elif len(industry_probabilities) == 1:
log.info("Probability value found for industry '%s'.", industry)
industry_probability = industry_probabilities[0]
else:
log.info("No probability value found for industry '%s'.", industry)
if size_probability and industry_probability:
log.info(
"Probabilities found for both size '%s' and industry '%s', averaging...",
size,
industry,
)
probability = (size_probability + industry_probability) / 2
else:
probability = size_probability or industry_probability
return probability
# pylint: enable=too-many-branches,too-many-locals,too-many-statements
# pylint: disable=too-many-branches,too-many-locals,too-many-statements
@staticmethod
def get_incident_cost_averages(
pairing: Tuple = ("All", "All")
) -> Tuple[float, float]:
"""
Attempts to get the average incident costs over a year, specific to the
organisational details provided.
The CSBS specifies figures for breaches both 'with' and 'without outcomes'.
We have ignored the latter here.
"""
size = pairing[0]
industry = pairing[1]
size_mean = None
size_median = None
industry_mean = None
industry_median = None
log.info(
"Attempting to get incident cost averages specific to ('%s', '%s')...",
size,
industry,
)
size_node = GraphInterface._get_node("Size", name=size)
if size_node:
log.info("Found node for size '%s'.", size)
else:
log.info("No node found for size '%s'.", size)
industry_node = GraphInterface._get_node("Industry", name=industry)
if industry_node:
log.info("Found node for industry '%s'.", industry)
else:
log.info("No node found for industry '%s'.", industry)
# If no figures were found for this pairing, returns (None, None) so that
# the caller can still unpack the result.
if size_node is None and industry_node is None:
return None, None
if size_node:
size_relations = GraphInterface.g.match({size_node}, r_type=FOR_SIZE)
size_means = []
size_medians = []
for rel in size_relations:
if rel.start_node.has_label("IncidentCostAverages"):
size_means.append(rel.start_node["mean"])
size_medians.append(rel.start_node["median"])
# Converts however many mean and median values were returned into one of
# each.
if len(size_means) > 1:
log.info("Multiple mean values found for size '%s', averaging...", size)
size_mean = sum(size_means) / len(size_means)
elif len(size_means) == 1:
log.info("Mean value found for size '%s'.", size)
size_mean = size_means[0]
else:
log.info("No mean values found for size '%s'.", size)
if len(size_medians) > 1:
log.info(
"Multiple median values found for size '%s', averaging...", size
)
size_median = sum(size_medians) / len(size_medians)
elif len(size_medians) == 1:
log.info("Median value found for size '%s'.", size)
size_median = size_medians[0]
else:
log.info("No median values found for size '%s'.", size)
if industry_node:
industry_relations = GraphInterface.g.match(
{industry_node}, r_type=FOR_INDUSTRY
)
industry_means = []
industry_medians = []
for rel in industry_relations:
if rel.start_node.has_label("IncidentCostAverages"):
industry_means.append(rel.start_node["mean"])
industry_medians.append(rel.start_node["median"])
# Converts however many mean and median values were returned into one of
# each.
if len(industry_means) > 1:
log.info(
"Multiple mean values found for industry '%s', averaging...",
industry,
)
industry_mean = sum(industry_means) / len(industry_means)
elif len(industry_means) == 1:
log.info("Mean value found for industry '%s'.", industry)
industry_mean = industry_means[0]
else:
log.info("No mean values found for industry '%s'.", industry)
if len(industry_medians) > 1:
log.info(
"Multiple median values found for industry '%s', averaging...",
industry,
)
industry_median = sum(industry_medians) / len(industry_medians)
elif len(industry_medians) == 1:
log.info("Median value found for industry '%s'.", industry)
industry_median = industry_medians[0]
else:
log.info("No median values found for industry '%s'.", industry)
if size_mean and industry_mean:
log.info(
"Mean values found for both size '%s' and industry '%s', averaging...",
size,
industry,
)
mean = (size_mean + industry_mean) / 2
else:
mean = size_mean or industry_mean
if size_median and industry_median:
log.info(
"Median values found for both size '%s' and industry '%s', averaging...",
size,
industry,
)
median = (size_median + industry_median) / 2
else:
median = size_median or industry_median
return mean, median
# pylint: enable=too-many-branches,too-many-locals,too-many-statements
# pylint: disable=invalid-name,anomalous-backslash-in-string
@staticmethod
def get_incident_frequency_distribution(
pairing: Tuple = ("All", "All")
) -> Union[Dict[str, float], None]:
"""
Returns the most relevant available incident frequency distribution for
a given pairing.
The algorithm for determining this is currently very basic:
1. search for an exact match for the pairing, and return that if found; else
2. return the distribution for :math:`\left(\text{All}, \text{All}\right)`.
In future, this can and should be expanded to follow complex heuristics
for similarity (and some relationships for doing so are provided at the
end of this module). For example, two industries can be joined using the
SIMILAR_TO relationship, which would allow the algorithm to traverse
laterally to other leaf nodes.
An even simpler improvement would be to add handling for partial matches
(e.g., returning :math:`\left(\text{Micro}, \text{All}\right)`, which
should be more relevant to a :math:`\left(\text{Micro}, \text{IT}\right)`
organisation than the fallback :math:`\left(\text{All}, \text{All}\right)`
values will be).
"""
# pylint: enable=anomalous-backslash-in-string
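# Returned value (when found) is a dict of the distribution's parameters,
# e.g. {"a": ..., "b": ...} for the log-linear fit stored in the graph.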
size = pairing[0]
industry = pairing[1]
size_node = GraphInterface._get_node("Size", name=size)
if size_node:
log.info("Found node for size '%s'.", size)
else:
log.info("No node found for size '%s'.", size)
industry_node = GraphInterface._get_node("Industry", name=industry)
if industry_node:
log.info("Found node for industry '%s'.", industry)
else:
log.info("No node found for industry '%s'.", industry)
# If no figures were found for this pairing, returns the fallback values.
if size_node is None and industry_node is None:
return GraphInterface._get_frequency_distribution()
dist: Union[
Dict[float, float], None
] = GraphInterface._get_frequency_distribution(size, industry)
if dist is not None:
log.debug(
"Returned values are: a = %s, b = %s", str(dist["a"]), str(dist["b"])
)
return dist
# pylint: enable=invalid-name
# pylint: disable=anomalous-backslash-in-string
@staticmethod
def get_incident_costs_distribution(
pairing: Tuple = ("All", "All")
) -> Union[Dict[str, float], None]:
"""
Returns the most relevant available incident costs distribution for
a given pairing.
The algorithm for determining this is currently very basic:
1. search for an exact match for the pairing, and return that if found; else
2. return the distribution for :math:`\left(\text{All}, \text{All}\right)`.
In future, this can and should be expanded to follow complex heuristics
for similarity (and some relationships for doing so are provided at the
end of this module). For example, two industries can be joined using the
SIMILAR_TO relationship, which would allow the algorithm to traverse
laterally to other leaf nodes.
An even simpler improvement would be to add handling for partial matches
(e.g., returning :math:`\left(\text{Micro}, \text{All}\right)`, which
should be more relevant to a :math:`\left(\text{Micro}, \text{IT}\right)`
organisation than the fallback :math:`\left(\text{All}, \text{All}\right)`
values will be).
"""
# pylint: enable=anomalous-backslash-in-string
size = pairing[0]
industry = pairing[1]
size_node = GraphInterface._get_node("Size", name=size)
if size_node:
log.info("Found node for size '%s'.", size)
else:
log.info("No node found for size '%s'.", size)
industry_node = GraphInterface._get_node("Industry", name=industry)
if industry_node:
log.info("Found node for industry '%s'.", industry)
else:
log.info("No node found for industry '%s'.", industry)
# If no figures were found for this pairing, returns the fallback values.
if size_node is None and industry_node is None:
return GraphInterface._get_costs_distribution()
dist: Union[Dict[float, float], None] = GraphInterface._get_costs_distribution(
size, industry
)
if dist is not None:
log.debug(
"Returned values are: mean = %s, stddev = %s",
str(dist["mean"]),
str(dist["stddev"]),
)
return dist
@staticmethod
def get_sizes() -> List[str]:
"""Returns a list of all of the organisation size values."""
nodes = GraphInterface._get_nodes("Size")
return [node["name"] for node in nodes]
@staticmethod
def get_industries() -> List[str]:
"""Returns a list of all of the organisation industry values."""
nodes = GraphInterface._get_nodes("Industry")
return [node["name"] for node in nodes]
@staticmethod
def get_sizes_and_industries() -> Tuple[list, list]:
"""Returns all available organisation size and industry values."""
return GraphInterface.get_sizes(), GraphInterface.get_industries()
# pylint: disable=invalid-name
@staticmethod
def create_incident_frequency_distribution_node(
pairing: Tuple, a: float, b: float
) -> Node:
"""Adds an `IncidentFrequencyDistribution` node to the Neo4j graph database."""
size_node = GraphInterface._get_node("Size", name=pairing[0])
industry_node = GraphInterface._get_node("Industry", name=pairing[1])
node = GraphInterface._create_node(
"IncidentFrequencyDistribution", a=a, b=b, calculated_at=datetime.now()
)
GraphInterface._create_relationship(node, FOR_SIZE, size_node)
GraphInterface._create_relationship(node, FOR_INDUSTRY, industry_node)
return node
# pylint: enable=invalid-name
@staticmethod
def create_incident_costs_distribution_node(
pairing: Tuple, mean: float, stddev: float
) -> Node:
"""Adds an `IncidentCostsDistribution` node to the Neo4j graph database."""
size_node = GraphInterface._get_node("Size", name=pairing[0])
industry_node = GraphInterface._get_node("Industry", name=pairing[1])
node = GraphInterface._create_node(
"IncidentCostsDistribution",
mean=mean,
stddev=stddev,
calculated_at=datetime.now(),
)
GraphInterface._create_relationship(node, FOR_SIZE, size_node)
GraphInterface._create_relationship(node, FOR_INDUSTRY, industry_node)
return node
# pylint: disable=anomalous-backslash-in-string,invalid-name
@staticmethod
def _get_frequency_distribution(
size: str = "All", industry: str = "All"
) -> Union[Dict[str, float], None]:
"""
Returns the :math:`a` and :math:`b` values from the requested incident
frequency distribution node (if it exists). Call with no arguments to
use the fallback (:math:`\left(\text{All}, \text{All}\right)`) node.
"""
# pylint: enable=anomalous-backslash-in-string
# pylint: disable=line-too-long
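# The doubled braces survive `str.format()` as literal '{'/'}' characters,
# so the query reaching Neo4j contains ordinary Cypher property maps.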
result = GraphInterface.g.run(
"MATCH (:Size {{name:'{}'}})<-[:FOR_SIZE]-(node:IncidentFrequencyDistribution)-[:FOR_INDUSTRY]->(:Industry {{name:'{}'}}) "
"RETURN node;".format(size, industry)
)
# pylint: enable=line-too-long
nodes = [record["node"] for record in result]
if len(nodes) == 0:
# There should always be a (All, All) distribution at least.
if size == "All" and industry == "All":
raise Exception("No fallback node found!")
log.debug(
"No incident frequency distribution found for (%s, %s).",
str(size),
str(industry),
)
return None
log.debug("Results: %s", str(nodes))
a = [node["a"] for node in nodes]
b = [node["b"] for node in nodes]
if len(nodes) > 1:
log.info("Multiple fallback nodes found, averaging parameters...")
a = sum(a) / len(a)
b = sum(b) / len(b)
else:
a = a[0]
b = b[0]
return {"a": a, "b": b}
# pylint: enable=invalid-name
# pylint: disable=anomalous-backslash-in-string
@staticmethod
def _get_costs_distribution(
size: str = "All", industry: str = "All"
) -> Union[Dict[str, float], None]:
"""
Returns the mean and standard deviation values from the requested incident
costs distribution node (if it exists). Call with no arguments to use the
fallback (:math:`\left(\text{All}, \text{All}\right)`) node.
"""
# pylint: enable=anomalous-backslash-in-string
# pylint: disable=line-too-long
result = GraphInterface.g.run(
"MATCH (:Size {{name:'{}'}})<-[:FOR_SIZE]-(node:IncidentCostsDistribution)-[:FOR_INDUSTRY]->(:Industry {{name:'{}'}}) "
"RETURN node;".format(size, industry)
)
# pylint: enable=line-too-long
nodes = [record["node"] for record in result]
if len(nodes) == 0:
# There should always be a (All, All) distribution at least.
if size == "All" and industry == "All":
raise Exception("No fallback node found!")
log.debug(
"No incident costs distribution found for (%s, %s).",
str(size),
str(industry),
)
return None
log.debug("Results: %s", str(nodes))
mean = [node["mean"] for node in nodes]
stddev = [node["stddev"] for node in nodes]
if len(nodes) > 1:
log.info("Multiple fallback nodes found, averaging parameters...")
mean = sum(mean) / len(mean)
stddev = sum(stddev) / len(stddev)
else:
mean = mean[0]
stddev = stddev[0]
return {"mean": mean, "stddev": stddev}
# pylint: disable=invalid-name
@staticmethod
def _create_node(*labels, **properties) -> Node:
"""Creates a new node in the Neo4j graph database."""
tx = GraphInterface.g.begin()
node = Node(*labels, **properties)
tx.create(node)
tx.commit()
return node
# pylint: enable=invalid-name
# pylint: disable=invalid-name
@staticmethod
def _create_relationship(
start_node, relationship, end_node, **properties
) -> Relationship:
"""Creates a new relationship in the Neo4j graph database."""
tx = GraphInterface.g.begin()
relationship = Relationship(
start_node, relationship.__name__, end_node, **properties
)
tx.create(relationship)
tx.commit()
return relationship
# pylint: enable=invalid-name
@staticmethod
def _get_node(*labels, **properties) -> Union[Node, None]:
"""Returns a node from the Neo4j graph database."""
return GraphInterface.g.nodes.match(*labels, **properties).first()
@staticmethod
def _get_nodes(*labels, **properties) -> NodeMatcher:
"""Returns a node from the Neo4j graph database."""
return GraphInterface.g.nodes.match(*labels, **properties)
@staticmethod
def _dict_to_jsobj(properties) -> str:
"""Recursively converts a Python `dict` into a JS `Object`."""
if isinstance(properties, dict):
return re.sub("'([a-z_]*)':", "\\1:", str(properties))
if isinstance(properties, str):
return GraphInterface._dict_to_jsobj({"name": properties})
return "{}"
# pylint: disable=invalid-name,missing-class-docstring
class SUBSECTION_OF(Relationship):
pass
class SECTION_OF(Relationship):
pass
class SIMILAR_TO(Relationship):
pass
class FOR_SIZE(Relationship):
pass
class FOR_INDUSTRY(Relationship):
pass
# pylint: enable=invalid-name,missing-class-docstring

420
src/montecarlo.py Normal file

@@ -0,0 +1,420 @@
"""
Monte Carlo Simulation Script
This script runs a Monte Carlo simulation for an organisation of a given
size and industry, utilising the most relevant distributions available.
Acknowledgements: Dr Dan Prince & Dr Chris Sherlock
"""
import os
import sys
import argparse
import pickle
import logging as log
from typing import Tuple, Dict, Union
import random
import math
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
from graph import GraphInterface as gi
# Used for logging, equivalent to `logging.INFO`.
SUCCESS = 20
# If not specified, the default number of Monte Carlo simulation runs to perform.
DEFAULT_RUNS = 5000
# The arbitrary maximum number of incidents that an organisation can experience
# in a year.
MAX_ANNUAL_INCIDENTS = 8000
# The maximum value of a company; any yearly losses over this value result in bankruptcy.
COMPANY_VALUE = 100000
# A smaller value increases the curviness of the loss exceedance curve.
# Less than 30 starts to get a bit steppy, though.
LEC_PRECISION = math.floor(COMPANY_VALUE / 30)
# Quantifies the qualitative boundaries for human-readable incident frequencies,
# which many sources (e.g., the CSBS 2020) use to present their results:
#
# 'None' = 0
# 'Annually' = 1
# 'Less than monthly' = 2-7
# 'Monthly' = 8-17
# 'Weekly' = 18-79
# 'Daily' = 80-399
# 'More than daily' = 400-8000
BOUNDARIES = {
"None": 0,
"Once per year": 1,
"Less than once a month": 2,
"Once a month": 8,
"Once a week": 18,
"Once a day": 80,
"Several times a day": 400,
"MAX": MAX_ANNUAL_INCIDENTS,
}
N = None
OUTPUT_DIR = None
IMAGES = None
FORCE = None
def _calculate_num_of_incidents(incidents_dist: Dict[float, float]) -> float:
"""Calculate how many incidents have occurred in a given year."""
log.debug("Incident distribution: %s", str(incidents_dist))
num_of_incidents = incidents_dist["b"] / (1 - np.random.uniform()) ** (
1 / incidents_dist["a"]
)
log.debug("Number of incidents (as `int`): %s", str(int(num_of_incidents)))
return (
int(num_of_incidents)
if num_of_incidents <= MAX_ANNUAL_INCIDENTS
else MAX_ANNUAL_INCIDENTS
)
def _calculate_sum_cost_of_incidents(
num_of_incidents: int, costs_dist: Dict[float, float], idx: int = None
) -> float:
"""For a list of incident numbers, calculate how much each breach cost and
return the sum."""
log.debug("Costs distribution: %s", str(costs_dist))
if (N < 1000) or (N >= 1000 and idx % math.floor(N / 100) == 0):
log.info(
"Running Monte Carlo simulation... (%s/%s iterations)", str(idx), str(N)
)
if num_of_incidents == 0:
return 0
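# Moment-matching: for a log-normal with arithmetic mean m and standard
# deviation s, the underlying normal has mu = ln(m^2 / sqrt(s^2 + m^2)) and
# sigma = sqrt(ln(1 + s^2 / m^2)), which are the `mu`/`sigma` arguments
# expected by `random.lognormvariate`.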
loc = np.log(
costs_dist["mean"] ** 2
/ np.sqrt(costs_dist["stddev"] ** 2 + costs_dist["mean"] ** 2)
)
shape = np.sqrt(np.log(1 + (costs_dist["stddev"] ** 2 / costs_dist["mean"] ** 2)))
costs = [random.lognormvariate(loc, shape) for r in range(num_of_incidents)]
return sum(costs)
# pylint: disable=invalid-name
def _get_most_relevant_incident_frequency_distribution(
pairing: Tuple = ("All", "All")
) -> Union[Dict[float, float], None]:
"""Gets the distribution for incident frequency from the data in the Neo4j
graph database."""
log.info(
"Finding most relevant incident frequency distribution for %s...", str(pairing)
)
return gi.get_incident_frequency_distribution(pairing)
# pylint: enable=invalid-name
def _get_most_relevant_incident_costs_distribution(
pairing: Tuple = ("All", "All")
) -> Union[Dict[float, float], None]:
"""Gets the distribution for incident costs from the data in the Neo4j
graph database."""
log.info(
"Finding most relevant incident costs distribution for %s...", str(pairing)
)
return gi.get_incident_costs_distribution(pairing)
def _get_most_relevant_distributions(
pairing: Tuple = ("All", "All")
) -> Tuple[Union[Dict[float, float], None], Union[Dict[float, float], None]]:
"""Generate (or retrieve) a population of annual incident quantities and a
distribution of incident-with-outcome cost values."""
# -- caching --
# Retrieves previously-calculated values if possible
if not FORCE and OUTPUT_DIR is not None:
try:
filename = "{}-{}.pickle".format(pairing[0], pairing[1])
dists = pickle.load(open(OUTPUT_DIR + filename, "rb"))
log.info("Previously-calculated distributions found")
return dists["incidents"], dists["costs"]
except (OSError, IOError):
log.info("Previously-calculated distributions not found")
# Otherwise, generates fresh ones
gi.__init__()
incidents_dist = _get_most_relevant_incident_frequency_distribution(pairing)
costs_dist = _get_most_relevant_incident_costs_distribution(pairing)
log.debug(
"Returned values are: incidents_dist = %s, costs_dist = %s",
str(incidents_dist),
str(costs_dist),
)
# Saves the figures for faster analysis in future
if OUTPUT_DIR is not None and incidents_dist is not None and costs_dist is not None:
dists = {
"incidents": incidents_dist,
"costs": costs_dist,
}
filename = "{}-{}.pickle".format(pairing[0], pairing[1])
pickle.dump(dists, open(OUTPUT_DIR + filename, "wb"))
return incidents_dist, costs_dist
# pylint: disable=anomalous-backslash-in-string
def _run_monte_carlo_simulation(pairing: Tuple = ("All", "All")) -> Tuple:
"""
Runs :math:`n` simulations of a 12-month period, calculating the number
of incidents encountered each time and their cumulative costs.
"""
# pylint: enable=anomalous-backslash-in-string
# Generates both distributions
incidents_dist, costs_dist = _get_most_relevant_distributions(pairing)
if incidents_dist is None or costs_dist is None:
return incidents_dist, costs_dist
# Calculates the number of incidents suffered over $n$ simulated years
nums_of_incidents = np.array(
[_calculate_num_of_incidents(incidents_dist) for i in range(N)]
)
log.debug("Number of incidents: %s", str(nums_of_incidents))
_label_plot(
"Histogram of Incident Frequencies (over 12 months)",
"Number of Incidents ($log_{10}$)",
"Frequency",
)
plt.hist(
[np.log10(i) if i > 0 else 0 for i in nums_of_incidents],
align="left",
bins=range(12),
)
_save_plot("2 - histogram of incident frequencies")
# Calculates the annual costs for each simulated year
log.info("Running Monte Carlo simulation... (0/%s iterations)", str(N))
sum_costs = [
_calculate_sum_cost_of_incidents(num_of_incidents, costs_dist, idx)
for idx, num_of_incidents in enumerate(nums_of_incidents, start=1)
]
log.info("Running Monte Carlo simulation... (%s/%s iterations)", str(N), str(N))
_label_plot(
"Histogram of Sum Costs (over 12 months)", "Total Cost (£)", "Frequency"
)
plt.ticklabel_format(style="plain")
plt.hist(sum_costs, align="left", bins=15, range=(0, COMPANY_VALUE))
_save_plot("4 - histogram of sum costs")
_label_plot("Density of Sum Costs (over 12 months)", "Total Cost (£)", "Density")
pd.Series(sum_costs).plot(kind="density")
plt.xlim(0, COMPANY_VALUE * 2)
plt.ticklabel_format(style="plain")
_save_plot("5 - density of sum costs")
# Get loss exceedance curve
log.info("Generating loss exceedance curve")
hist, edges = np.histogram(sum_costs, bins=LEC_PRECISION)
cumrev = np.cumsum(hist[::-1])[::-1] * 100 / len(sum_costs)
_label_plot(
"Loss Exceedance Curve (Monte Carlo sim)",
"Loss (£, 99th percentile)",
"Chance of Loss or Greater (%)",
)
plt.ticklabel_format(style="plain")
plt.xlim(0, COMPANY_VALUE)
plt.plot(edges[:-1], cumrev)
_save_plot("6 - lec" if IMAGES else "lec")
log.info("Simulation complete!")
return nums_of_incidents, sum_costs
def main():
"""Called when the script is run from the command-line"""
# pylint: disable=global-statement
global N, OUTPUT_DIR, IMAGES, FORCE
# pylint: enable=global-statement
parser = argparse.ArgumentParser()
parser.add_argument(
"-n",
"--number",
help="The number of simulations to run (default: " + str(DEFAULT_RUNS) + ")",
type=int,
default=DEFAULT_RUNS,
)
parser.add_argument(
"-s",
"--size",
help="The size of the organisation to simulate (default: all)",
type=str,
default="All",
)
parser.add_argument(
"-i",
"--industry",
help="The industry of the organisation to simulate (default: all)",
type=str,
default="All",
)
parser.add_argument(
"-o",
"--output",
help="Specify the output directory (default: ./output/)",
type=str,
default=os.path.join(os.path.dirname(__file__), "output/"),
metavar="DIRECTORY",
)
parser.add_argument(
"-p",
"--images",
help="Output images at each step of the script (default: false, just \
output the final LEC image)",
action="store_true",
default=False,
)
parser.add_argument(
"-f",
"--force",
help="Force re-generation of incident and cost distributions (default: false)",
action="store_true",
default=False,
)
parser.add_argument(
"-v",
"--verbose",
help="Verbose console output (default: false)",
action="store_true",
default=False,
)
parser.add_argument(
"-d",
"--debug",
help="Show debug console output (default: false)",
action="store_true",
default=False,
)
args = parser.parse_args()
N = args.number
OUTPUT_DIR = args.output
IMAGES = args.images
FORCE = args.force
size = args.size
industry = args.industry
if args.debug:
log.basicConfig(format="%(levelname)s: %(message)s", level=log.DEBUG)
log.info("Debug output.")
elif args.verbose:
log.basicConfig(format="%(levelname)s: %(message)s", level=log.INFO)
log.info("Verbose output.")
else:
log.basicConfig(format="%(levelname)s: %(message)s")
if not os.path.isdir(OUTPUT_DIR):
os.makedirs(OUTPUT_DIR)
if size or industry:
print("Running simulation for ({}, {})".format(size, industry))
nums_of_incidents, sum_costs = _run_monte_carlo_simulation((size, industry))
if nums_of_incidents is not None and sum_costs is not None:
log.info(
"Results:\nNumbers of incidents: %s\nSum costs: %s\n",
str(nums_of_incidents),
str(sum_costs),
)
avg_num_of_incidents = int(sum(nums_of_incidents) / len(nums_of_incidents))
avg_sum_costs = sum(sum_costs) / len(sum_costs)
log.log(
SUCCESS,
"Results:\nAverage number of incidents: %d\nAverage cost: £%.2f",
avg_num_of_incidents,
avg_sum_costs,
)
# Print output that will be picked up by game server.
# pylint: disable=fixme
# TODO: For some reason the results at the moment are orders of magnitude
# too high, so for now I've plugged it by dividing both results by 100.
# pylint: enable=fixme
print(int(avg_num_of_incidents / 100))
print("%.2f" % (avg_sum_costs / 100))
else:
log.warning("No data found.")
print("No data found.")
print("Running simulation for (All, All)")
gen_nums_of_incidents, gen_sum_costs = _run_monte_carlo_simulation()
log.info(
"Results:\nNumbers of incidents: %s\nSum costs: %s\n",
str(gen_nums_of_incidents),
str(gen_sum_costs),
)
avg_gen_num_of_incidents = int(
sum(gen_nums_of_incidents) / len(gen_nums_of_incidents)
)
avg_gen_sum_costs = sum(gen_sum_costs) / len(gen_sum_costs)
log.log(
SUCCESS,
"Results:\nAverage number of incidents: %d\nAverage cost: £%.2f",
avg_gen_num_of_incidents,
avg_gen_sum_costs,
)
# Print output that will be picked up by the game server.
print(int(avg_gen_num_of_incidents / 100))
print("%.2f" % (avg_gen_sum_costs / 100))
sys.exit(0)
def _label_plot(title="Untitled Plot", xlabel="x axis", ylabel="y axis") -> None:
"""Apply titles and axis labels to a plot."""
plt.title(title)
plt.xlabel(xlabel)
plt.ylabel(ylabel)
def _save_plot(filename="untitled", always=False) -> None:
"""Save a plot (when image output is enabled or `always` is set) and clear the figure."""
if IMAGES or always:
plt.savefig(OUTPUT_DIR + filename + ".png")
plt.clf()
if __name__ == "__main__":
main()

103
src/montecarlo.r Normal file

@@ -0,0 +1,103 @@
#
# Secure Digitalisation Monte Carlo Simulation Script
#
# This script runs a Monte Carlo simulation using breach likelihood and cost
# figures derived from the Cyber Security Breaches Survey 2020 (CSBS).
# This script is an unfinished prototype, and has since been superseded by
# `montecarlo.py`.
#
# Acknowledgements: Dr Dan Prince & Dr Chris Sherlock
#
masses = c(0.54, 0.1058, 0.1012, 0.0966, 0.069, 0.0368, 0.0414)
boundaries = c(1, 2, 8, 18, 80, 400, 8000)
Fs = cumsum(masses)
plot(log(boundaries), log(1 - Fs))
xs = log(boundaries)
ys = log(1 - Fs)
fit = lm(ys ~ xs)
summary(fit)
alogb = fit$coeff[1]
a = -fit$coeff[2]
b = exp(alogb/a)
print(a)
print(b)
n = 10000
us = runif(n)
xs = b / (1 - us)^(1 / a)
p0 = mean(xs < boundaries[1])
p1 = mean(xs < boundaries[2]) - p0
p2 = mean(xs < boundaries[3]) - p0 - p1
p3 = mean(xs < boundaries[4]) - p0 - p1 - p2
p4 = mean(xs < boundaries[5]) - p0 - p1 - p2 - p3
p5 = mean(xs < boundaries[6]) - p0 - p1 - p2 - p3 - p4
ps = c(p0, p1, p2, p3, p4, p5, 1 - (p0 + p1 + p2 + p3 + p4 + p5))
print(ps)
print(masses)
nattacks = floor(xs)
hist(log10(nattacks),
main = "Histogram of Number of Attacks/Breaches Over 12 Months",
xlab = expression("Number of Attacks (log"[10]*")"),
ylab = "Frequency",
breaks = 0:12)
# Plots the distribution for the average cost of breach(es) over 12 months
mean = 3230
median = 274
logstd = sqrt(2 * (log(mean) - if (median == 0) 0 else log(median)))
std = exp(1)^logstd
curve(dlnorm(x, log(mean), log(std)), from=1, to=5000,
main = "Average annual breach cost distribution",
xlab = 'Cost (£)',
ylab = 'Density',
lwd = 2)
# Runs the Monte Carlo simulation
simulateCosts <- function(n) {
return(if (n >= 1) sum(rlnorm(n, loc, shape)) else 0)
}
n = 10000
loc <- log(mean^2 / sqrt(std^2 + mean^2))
shape <- sqrt(log(1 + (std^2 / mean^2)))
numAttacks <- sample(nattacks, n)
results <- sapply(numAttacks, simulateCosts)
hist(results,
main = "Histogram of Total Costs Over 12 Months (Monte Carlo sim)",
xlab = "Total cost (£)")
d <- density(results)
plot(d,
main="Density of Total Costs Over 12 Months (Monte Carlo sim)",
xlab=expression("Total Cost (£)"),
ylab="Density")
# Get loss exceedance
# TODO: needs to be prettier, but `evaluate::loss_exceedance_curve()` is broken
maxValue = 2500
numOver <- length(results[results > maxValue])
risk = numOver/n
plot(d,
main="Loss Exceedance (Monte Carlo sim)",
xlab=expression("Total Cost (£)"),
ylab="Density")
abline(v = maxValue, col="red", lwd=3, lty=2)
text(3000, 4e-04, labels=paste(floor(risk*100), "% chance of ≥£", maxValue, " losses"), adj=c(0, 0.5))


@@ -0,0 +1,78 @@
#
# Secure Digitalisation Neo4j Connection Script
#
# This script is intended to establish a connection to a Neo4j graph database
# and submit commands.
# This script is an unfinished prototype, and has since been superseded by
# `graph.py`.
#
install.packages('tidyverse')
library(tidyverse)
install.packages('purrr')
library(purrr)
install.packages('devtools')
library(devtools)
install_github("davidlrosenblum/neo4r@4.x")
library(neo4r)
RUNS <- 1000
DECISION.STEPS <- 12
get_likelihood <- function() {
res <- 'MATCH (i:Incident) WHERE EXISTS (i.probability) AND NOT (i)-[:FOR_SIZE]-() AND NOT (i)-[:FOR_INDUSTRY]-() AND NOT (i)-[:FOR_AREA]-() RETURN i.probability AS probability;' %>%
call_neo4j(con, type = 'row')
res$probability / 100
}
# Currently only does direct costs
get_costs <- function() {
res <- 'MATCH (i:Incident) WHERE EXISTS (i.direct_costs) AND NOT (i)-[:FOR_SIZE]-() AND NOT (i)-[:FOR_INDUSTRY]-() AND NOT (i)-[:FOR_AREA]-() RETURN i.direct_costs[0] AS cost;' %>%
call_neo4j(con, type = 'row')
res$cost
}
calculate_cost <- function(alpha) {
l <- get_likelihood()
happen <- runif(1, 0, 1)
# An incident occurs with probability `l`.
if (happen < l) {
cost <- as.numeric(get_costs())
s <- sd(580:630)
# Moment-matching to the log-normal parameters, as in `montecarlo.py`.
location <- log(cost^2 / sqrt(s^2 + cost^2))
shape <- sqrt(log(1 + (s^2 / cost^2)))
rlnorm(1, location, shape)
} else {
0
}
}
con <- neo4j_api$new(
url="http://localhost:7474",
db="neo4j",
user="neo4j",
password="password"
)
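# Builds RUNS simulated trajectories of DECISION.STEPS steps each, accumulates
# the per-step values within each simulation, and reshapes the result into a
# long tibble of (simulation, step, value) rows for plotting.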
simulations <- rerun(RUNS, replicate(DECISION.STEPS, runif(1) %>% calculate_cost())) %>%
set_names(paste0("sim", 1:RUNS)) %>%
map(~ accumulate(., ~ .x * .y)) %>%
map_dfr(~ tibble(value = .x, step = 1:DECISION.STEPS), .id = "simulation")
simulations %>%
ggplot(aes(x = step, y = value)) +
geom_line(aes(color = simulation)) +
theme(legend.position = "none") +
ggtitle("Simulations of costs from breaches")
summary_values <- simulations %>%
group_by(step) %>%
summarise(mean_return = mean(value), max_return = max(value), min_return = min(value)) %>%
gather("series", "value", -step)
summary_values %>%
ggplot(aes(x = step, y = value)) +
geom_line(aes(color = series)) +
ggtitle("Mean values from simulations")


@@ -0,0 +1,332 @@
"""
Distributions (Re)generation Script
This script generates likelihood and cost distributions based on threat
intelligence data stored in a connected Neo4j graph database. It attempts to
do so for every possible permutation of (size, industry) values.
These are then consumed by `montecarlo.py`, which runs a Monte Carlo
simulation based on these figures.
Acknowledgements: Dr Dan Prince & Dr Chris Sherlock
"""
import os
import sys
import argparse
import warnings
import logging as log
from typing import Tuple
import itertools
import numpy as np
import pandas as pd
import statsmodels.formula.api as smf
from matplotlib import pyplot as plt
from scipy.stats import lognorm
from graph import GraphInterface as gi
# Used for logging, equivalent to `logging.WARNING` + 1.
SUCCESS = 31
# The arbitrary maximum number of incidents that an organisation can experience
# in a year.
MAX_ANNUAL_INCIDENTS = 8000
# Quantifies the qualitative boundaries for human-readable incident frequencies,
# which many sources (e.g., the CSBS 2020) use to present their results:
#
# 'None' = 0
# 'Annually' = 1
# 'Less than monthly' = 2-7
# 'Monthly' = 8-17
# 'Weekly' = 18-79
# 'Daily' = 80-399
# 'More than daily' = 400-8000
BOUNDARIES = {
"None": 0,
"Once per year": 1,
"Less than once a month": 2,
"Once a month": 8,
"Once a week": 18,
"Once a day": 80,
"Several times a day": 400,
"MAX": MAX_ANNUAL_INCIDENTS,
}
OUTPUT_DIR = None
IMAGES = None
# pylint: disable=invalid-name,anomalous-backslash-in-string
def _generate_new_incident_frequency_distribution(pairing: Tuple = (None, None)) -> int:
"""
Generates a new incident frequency distribution.
Notes
-----
(Re)generates the incident frequency distribution for a
:math:`\left(\text{size}, \text{industry}\right)` pairing from the data in
a Neo4j graph database.
Currently this only produces log-normal distributions. Additional types of
distribution can be implemented by overloading this method (by importing the
`multipledispatch` package) and returning the values required for defining
that distribution (e.g., :math:`\mu` and :math:`\sigma` instead of :math:`a`
and :math:`b`).
"""
# pylint: enable=anomalous-backslash-in-string
log.info("Generating new incident frequency distribution for '%s'...", str(pairing))
# Attempts to get the incident probabilities for the pairing from the graph
# database
incident_frequency_probabilities = gi.get_incident_frequency_probabilities(
list(BOUNDARIES.values())[:-1], pairing
)
if incident_frequency_probabilities is None:
log.info(
"No incident frequency distribution generated for '%s'.",
str(pairing),
)
return 0
log.debug(
"Returned values are: incident frequency probabilities = %s",
str(incident_frequency_probabilities),
)
# If values are found, generate a distribution
Fs = np.cumsum(incident_frequency_probabilities)
xs = np.log(list(BOUNDARIES.values())[1:])
ys = np.log(1 - Fs)
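# Fitting a straight line to the log-log survival values recovers the Pareto
# parameters: log(1 - F(x)) = a*log(b) - a*log(x), so the intercept estimates
# a*log(b) and the slope estimates -a (hence b = exp(intercept / a) below).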
data = pd.DataFrame({"xs": xs, "ys": ys})
# pylint: disable=line-too-long
# See <https://www.statsmodels.org/stable/_modules/statsmodels/stats/stattools.html#omni_normtest> for explanation
# pylint: enable=line-too-long
with warnings.catch_warnings():
warnings.simplefilter("ignore")
fit = smf.ols(formula="ys ~ xs", data=data).fit()
log.debug(fit.summary())
# Get the parameters for the generated distribution and store them in the
# graph database.
alogb = fit.params[0]
a = -fit.params[1]
b = np.exp(alogb / a)
gi.create_incident_frequency_distribution_node(pairing, a, b)
log.log(
SUCCESS,
"New incident frequency distribution successfully generated for '%s'.",
str(pairing),
)
return 1
# pylint: enable=invalid-name
# pylint: disable=anomalous-backslash-in-string
def _generate_new_incident_costs_distribution(pairing: Tuple = (None, None)) -> int:
"""
(Re)generates the incident cost distribution for a
:math:`\left(\text{size}, \text{industry}\right)` pairing from the data in
a Neo4j graph database.
Currently this only produces log-normal distributions. Additional types of
distribution can be implemented by overloading this method (by importing the
`multipledispatch` package) and returning the values required for defining
that distribution (e.g., :math:`\mu` and :math:`\sigma` instead of :math:`a`
and :math:`b`).
"""
# pylint: enable=anomalous-backslash-in-string
# Plots the distribution for the average cost of incident(s) over 12 months
log.info("Generating new incident cost distribution for '%s'...", str(pairing))
incident_mean_cost, incident_median_cost = gi.get_incident_cost_averages(pairing)
if incident_mean_cost is None or incident_median_cost is None:
log.info(
"No incident costs distribution generated for '%s'.",
str(pairing),
)
return 0
log.debug(
"Returned values are: mean = %s, median = %s",
str(incident_mean_cost),
str(incident_median_cost),
)
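# For a log-normal distribution, median = exp(mu) and mean = exp(mu + sigma^2 / 2),
# so sigma can be recovered from the two averages as
# sigma = sqrt(2 * (ln(mean) - ln(median))); the exponentiated value is then
# stored as `stddev`, mirroring the prototype in `montecarlo.r`.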
log_stddev = np.sqrt(
2
* (
np.log(incident_mean_cost)
- (0 if incident_median_cost == 0 else np.log(incident_median_cost))
)
)
stddev = np.exp(log_stddev)
_label_plot(
"Average annual incident-with-outcome cost distribution", "Cost (£)", "Density"
)
plt.plot(
[
lognorm.pdf(
np.log(i),
np.log(incident_mean_cost),
np.log(incident_median_cost) if incident_median_cost > 0 else 0,
)
for i in range(1, 2500)
]
)
_save_plot("3 - cost dist")
gi.create_incident_costs_distribution_node(pairing, incident_mean_cost, stddev)
log.log(
SUCCESS,
"New incident costs distribution successfully generated for '%s'.",
str(pairing),
)
return 1
def _generate_new_distributions(pairing: Tuple = (None, None)) -> Tuple:
"""(Re)generates the cost and likelihood distributions."""
gi.__init__()
log.info("Existing distributions deleted: %s", bool(gi.delete_distributions()))
successful_incidents_dists = 0
successful_costs_dists = 0
# If either size or industry is unspecified, gets all possible values.
sizes = gi.get_sizes() if pairing[0] is None else [pairing[0]]
industries = gi.get_industries() if pairing[1] is None else [pairing[1]]
# Attempts to generate new distributions for every combination of size and
# industry values.
for pair in list(itertools.product(sizes, industries)):
successful_incidents_dists += _generate_new_incident_frequency_distribution(
pair
)
successful_costs_dists += _generate_new_incident_costs_distribution(pair)
return successful_incidents_dists, successful_costs_dists
def main():
"""Called when the script is run from the command-line."""
# pylint: disable=global-statement
global OUTPUT_DIR, IMAGES
# pylint: enable=global-statement
parser = argparse.ArgumentParser()
parser.add_argument(
"-s",
"--size",
help="Specify the org. size (default: None)",
choices=["micro", "small", "medium", "large"],
type=str,
default=None,
)
parser.add_argument(
"-i",
"--industry",
help="Specify the org. industry SIC code (top-level only, e.g. C for "
"Manufacturing) (default: None)",
choices=list(map(chr, range(65, 86))),
type=str,
default=None,
)
parser.add_argument(
"-o",
"--output",
help="Specify the output directory (default: ./output/)",
type=str,
default=os.path.join(os.path.dirname(__file__), "output/"),
metavar="DIRECTORY",
)
parser.add_argument(
"-p",
"--images",
help="Output images at each step of the script (default: false, just "
"output the final LEC image)",
action="store_true",
default=False,
)
parser.add_argument(
"-v",
"--verbose",
help="Verbose console output (default: false)",
action="store_true",
default=False,
)
parser.add_argument(
"-d",
"--debug",
help="Show debug console output (default: false)",
action="store_true",
default=False,
)
args = parser.parse_args()
OUTPUT_DIR = args.output
IMAGES = args.images
size = args.size
industry = args.industry
if args.debug:
log.basicConfig(format="%(levelname)s: %(message)s", level=log.DEBUG)
log.info("Debug output.")
elif args.verbose:
log.basicConfig(format="%(levelname)s: %(message)s", level=log.INFO)
log.info("Verbose output.")
else:
log.basicConfig(format="%(levelname)s: %(message)s")
if not os.path.isdir(OUTPUT_DIR):
os.makedirs(OUTPUT_DIR)
incidents_dists, costs_dists = _generate_new_distributions((size, industry))
log.log(
SUCCESS,
"Successfully generated %s incident frequency distributions and %s "
"incident costs distributions!",
str(incidents_dists),
str(costs_dists),
)
sys.exit(0)
def _label_plot(title="Untitled Plot", xlabel="x axis", ylabel="y axis") -> None:
"""Apply titles and axis labels to a plot."""
plt.title(title)
plt.xlabel(xlabel)
plt.ylabel(ylabel)
def _save_plot(filename="untitled") -> None:
"""Save a plot and clear the figure."""
if IMAGES:
plt.savefig(OUTPUT_DIR + filename + ".png")
plt.clf()
if __name__ == "__main__":
main()