initial commit

This commit is contained in:
Ben Goldsworthy 2021-04-02 11:10:12 +01:00
commit 51deec9859
14 changed files with 3059 additions and 0 deletions

696
src/graph.py Normal file

@@ -0,0 +1,696 @@
"""
Neo4j Graph Database Interface
This module defines:
a) the interface for interacting with the Neo4j graph database; and
b) subclasses of `Relationship`.
"""
import re
import sys
import logging as log
from typing import List, Tuple, Union, Dict
from datetime import datetime
from py2neo import Graph, Node, NodeMatcher, Relationship, DatabaseError
import numpy as np
class GraphInterface:
"""
An interface for the Neo4j graph database used to hold threat intelligence (TI) data.
This interface abstracts out the actual transactions, allowing a user
to use more friendly methods without worrying about the implementation or
learning the Cypher query language syntax.
This class should:
a) determine the correct transactions to use based on the called
method and any arguments;
b) return only `Node`s, `Relationship`s, `SubGraph`s or lists thereof,
so that the values can be assigned to subclasses of those at the
point of calling; and
c) deal with any `Exception`s, but not issues like returning 0 results,
which should be dealt with at the point of calling.
"""
g: Graph = None
@staticmethod
def __init__():
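# NB: called explicitly (e.g. `GraphInterface.__init__()`) by the consuming
# scripts to open the shared, class-level connection; the class itself is
# never instantiated.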
try:
if GraphInterface.g is None:
GraphInterface.g = Graph(password="test")
log.info("Neo4j database connection opened successfully.")
else:
log.warning(
"Neo4j database already connected - this branch "
"shouldn't have been hit though!"
)
except DatabaseError:
log.error("ERR: Neo4j database connection not successfully opened.")
sys.exit()
@staticmethod
def delete_distributions() -> bool:
"""Deletes any pre-existing distributions."""
GraphInterface.g.run(
"MATCH (n) "
"WHERE n:IncidentFrequencyDistribution OR n:IncidentCostsDistribution "
"DETACH DELETE n;"
)
return True
@staticmethod
def get_incident_frequency_probabilities(
boundaries, pairing: Tuple = ("All", "All")
) -> List[float]:
"""
Attempts to get a list of probabilities for different annual incident
frequencies, specific to the organisational details provided.
It first gets (the average of) any sets of base frequencies, then looks
up the provided size/industry values to see if they have any assigned
breach probability values in the graph database. If multiple values are
found, the average is taken.
Once the specific base (i.e., >0) probability is found, it then recalculates
the overall set of probabilities as proportions of that base figure.
"""
size = pairing[0]
industry = pairing[1]
log.info(
"Attempting to get breach frequency probabilities specific to ('%s', '%s')...",
size,
industry,
)
base_frequency_probabilities_nodes = GraphInterface._get_nodes(
"IncidentBaseFrequencyProbabilities"
)
base_frequency_probabilities = [
node["probabilities"]
for node in base_frequency_probabilities_nodes
if len(node["probabilities"]) == (len(boundaries) - 1)
]
# If there are >1 sets of likelihoods, gets the mean for each boundary value.
if len(base_frequency_probabilities) > 1:
log.info("Multiple sets of base frequencies found, averaging...")
base_frequency_probabilities = np.array(base_frequency_probabilities)
# Averages column-wise (each row has `len(boundaries) - 1` entries) and
# keeps the result nested so that the `[0]` indexing below still works.
base_frequency_probabilities = [
[
np.mean(base_frequency_probabilities[:, i])
for i in range(len(boundaries) - 1)
]
]
probability_of_breach = GraphInterface.get_probability_of_breach(size, industry)
if probability_of_breach:
log.info(
"Found specific >0 breaches probability value for one or both "
"of ('%s', '%s'), calculating follow-on values...",
size,
industry,
)
# Sets the probability of having 0 breaches.
breach_frequency_probabilities = [(100 - probability_of_breach) / 100]
# Calculates the remaining probabilities proportional to the sum
# >0 breaches probability.
for base_frequency_probability in base_frequency_probabilities[0]:
breach_frequency_probabilities.append(
(probability_of_breach * base_frequency_probability) / 100
)
if len(breach_frequency_probabilities) != len(boundaries):
raise Exception("Mismatched boundaries!")
return breach_frequency_probabilities
log.info("No breach probability value found.")
return None
# pylint: disable=too-many-branches,too-many-locals,too-many-statements
@staticmethod
def get_probability_of_breach(size="All", industry="All") -> float:
"""
Returns the probability of an organisation of a given size and/or
industry experiencing a breach with an outcome in the next year.
Where probability values are found for both size and industry, the two are
averaged. It is assumed that organisations of a similar size will have a
more similar threat model than organisations within the same industry;
this assumption is not empirically grounded, however, so it may be that
the opposite is true.
"""
size_probability = None
industry_probability = None
size_node = GraphInterface._get_node("Size", name=size)
if size_node:
log.info("Found node for size '%s'.", size)
else:
log.info("No node found for size '%s'.", size)
industry_node = GraphInterface._get_node("Industry", name=industry)
if industry_node:
log.info("Found node for industry '%s'.", industry)
else:
log.info("No node found for industry '%s'.", industry)
# If no figures were found for this pairing, returns None.
if size_node is None and industry_node is None:
return None
if size_node:
size_relations = GraphInterface.g.match({size_node}, r_type=FOR_SIZE)
size_probabilities = []
for rel in size_relations:
if rel.start_node.has_label("IncidentProbability"):
size_probabilities.append(rel.start_node["probability"])
if len(size_probabilities) > 1:
log.info(
"Multiple probabilities found for size '%s', averaging...", size
)
size_probability = sum(size_probabilities) / len(size_probabilities)
elif len(size_probabilities) == 1:
log.info("Probability value found for size '%s'.", size)
size_probability = size_probabilities[0]
else:
log.info("No probability value found for size '%s'.", size)
if industry_node:
industry_relations = GraphInterface.g.match(
{industry_node}, r_type=FOR_INDUSTRY
)
industry_probabilities = []
for rel in industry_relations:
if rel.start_node.has_label("IncidentProbability"):
industry_probabilities.append(rel.start_node["probability"])
if len(industry_probabilities) > 1:
log.info(
"Multiple probabilities found for industry '%s', averaging...",
industry,
)
industry_probability = sum(industry_probabilities) / len(
industry_probabilities
)
elif len(industry_probabilities) == 1:
log.info("Probability value found for industry '%s'.", industry)
industry_probability = industry_probabilities[0]
else:
log.info("No probability value found for industry '%s'.", industry)
if size_probability and industry_probability:
log.info(
"Probabilities found for both size '%s' and industry '%s', averaging...",
size,
industry,
)
probability = (size_probability + industry_probability) / 2
else:
probability = size_probability or industry_probability
return probability
# pylint: enable=too-many-branches,too-many-locals,too-many-statements
# pylint: disable=too-many-branches,too-many-locals,too-many-statements
@staticmethod
def get_incident_cost_averages(
pairing: Tuple = ("All", "All")
) -> Tuple[float, float]:
"""
Attempts to get the average incident costs over a year, specific to the
organisational details provided.
The CSBS specifies figures for breaches both 'with' and 'without outcomes'.
We have ignored the latter here.
"""
size = pairing[0]
industry = pairing[1]
size_mean = None
size_median = None
industry_mean = None
industry_median = None
log.info(
"Attempting to get incident cost averages specific to ('%s', '%s')...",
size,
industry,
)
size_node = GraphInterface._get_node("Size", name=size)
if size_node:
log.info("Found node for size '%s'.", size)
else:
log.info("No node found for size '%s'.", size)
industry_node = GraphInterface._get_node("Industry", name=industry)
if industry_node:
log.info("Found node for industry '%s'.", industry)
else:
log.info("No node found for industry '%s'.", industry)
# If no figures were found for this pairing, returns (None, None) so that
# the caller can still unpack the result.
if size_node is None and industry_node is None:
return None, None
if size_node:
size_relations = GraphInterface.g.match({size_node}, r_type=FOR_SIZE)
size_means = []
size_medians = []
for rel in size_relations:
if rel.start_node.has_label("IncidentCostAverages"):
size_means.append(rel.start_node["mean"])
size_medians.append(rel.start_node["median"])
# Converts however many mean and median values were returned into one of
# each.
if len(size_means) > 1:
log.info("Multiple mean values found for size '%s', averaging...", size)
size_mean = sum(size_means) / len(size_means)
elif len(size_means) == 1:
log.info("Mean value found for size '%s'.", size)
size_mean = size_means[0]
else:
log.info("No mean values found for size '%s'.", size)
if len(size_medians) > 1:
log.info(
"Multiple median values found for size '%s', averaging...", size
)
size_median = sum(size_medians) / len(size_medians)
elif len(size_medians) == 1:
log.info("Median value found for size '%s'.", size)
size_median = size_medians[0]
else:
log.info("No median values found for size '%s'.", size)
if industry_node:
industry_relations = GraphInterface.g.match(
{industry_node}, r_type=FOR_INDUSTRY
)
industry_means = []
industry_medians = []
for rel in industry_relations:
if rel.start_node.has_label("IncidentCostAverages"):
industry_means.append(rel.start_node["mean"])
industry_medians.append(rel.start_node["median"])
# Converts however many mean and median values were returned into one of
# each.
if len(industry_means) > 1:
log.info(
"Multiple mean values found for industry '%s', averaging...",
industry,
)
industry_mean = sum(industry_means) / len(industry_means)
elif len(industry_means) == 1:
log.info("Mean value found for industry '%s'.", industry)
industry_mean = industry_means[0]
else:
log.info("No mean values found for industry '%s'.", industry)
if len(industry_medians) > 1:
log.info(
"Multiple median values found for industry '%s', averaging...",
industry,
)
industry_median = sum(industry_medians) / len(industry_medians)
elif len(industry_medians) == 1:
log.info("Median value found for industry '%s'.", industry)
industry_median = industry_medians[0]
else:
log.info("No median values found for industry '%s'.", industry)
if size_mean and industry_mean:
log.info(
"Mean values found for both size '%s' and industry '%s', averaging...",
size,
industry,
)
mean = (size_mean + industry_mean) / 2
else:
mean = size_mean or industry_mean
if size_median and industry_median:
log.info(
"Median values found for both size '%s' and industry '%s', averaging...",
size,
industry,
)
median = (size_median + industry_median) / 2
else:
median = size_median or industry_median
return mean, median
# pylint: enable=too-many-branches,too-many-locals,too-many-statements
# pylint: disable=invalid-name,anomalous-backslash-in-string
@staticmethod
def get_incident_frequency_distribution(
pairing: Tuple = ("All", "All")
) -> Union[Dict[str, float], None]:
"""
Returns the most relevant available incident frequency distribution for
a given pairing.
The algorithm for determining this is currently very basic:
1. search for an exact match for the pairing, and return that if found; else
2. return the distribution for :math:`\left(\text{All}, \text{All}\right)`.
In future, this can and should be expanded to follow complex heuristics
for similarity (and some relationships for doing so are provided at the
end of this module). For example, two industries can be joined using the
SIMILAR_TO relationship, which would allow the algorithm to traverse
laterally to other leaf nodes.
An even simpler improvement would be to add handling for partial matches
(e.g., returning :math:`\left(\text{Micro}, \text{All}\right)`, which
should be more relevant to a :math:`\left(\text{Micro}, \text{IT}\right)`
organisation than the fallback :math:`\left(\text{All}, \text{All}\right)`
values will be).
"""
# pylint: enable=anomalous-backslash-in-string
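# Returned value (when found) is a dict of the distribution's parameters,
# e.g. {"a": ..., "b": ...} for the log-linear fit stored in the graph.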
size = pairing[0]
industry = pairing[1]
size_node = GraphInterface._get_node("Size", name=size)
if size_node:
log.info("Found node for size '%s'.", size)
else:
log.info("No node found for size '%s'.", size)
industry_node = GraphInterface._get_node("Industry", name=industry)
if industry_node:
log.info("Found node for industry '%s'.", industry)
else:
log.info("No node found for industry '%s'.", industry)
# If no figures were found for this pairing, returns the fallback values.
if size_node is None and industry_node is None:
return GraphInterface._get_frequency_distribution()
dist: Union[
Dict[float, float], None
] = GraphInterface._get_frequency_distribution(size, industry)
if dist is not None:
log.debug(
"Returned values are: a = %s, b = %s", str(dist["a"]), str(dist["b"])
)
return dist
# pylint: enable=invalid-name
# pylint: disable=anomalous-backslash-in-string
@staticmethod
def get_incident_costs_distribution(
pairing: Tuple = ("All", "All")
) -> Union[Dict[str, float], None]:
"""
Returns the most relevant available incident costs distribution for
a given pairing.
The algorithm for determining this is currently very basic:
1. search for an exact match for the pairing, and return that if found; else
2. return the distribution for :math:`\left(\text{All}, \text{All}\right)`.
In future, this can and should be expanded to follow complex heuristics
for similarity (and some relationships for doing so are provided at the
end of this module). For example, two industries can be joined using the
SIMILAR_TO relationship, which would allow the algorithm to traverse
laterally to other leaf nodes.
An even simpler improvement would be to add handling for partial matches
(e.g., returning :math:`\left(\text{Micro}, \text{All}\right)`, which
should be more relevant to a :math:`\left(\text{Micro}, \text{IT}\right)`
organisation than the fallback :math:`\left(\text{All}, \text{All}\right)`
values will be).
"""
# pylint: enable=anomalous-backslash-in-string
size = pairing[0]
industry = pairing[1]
size_node = GraphInterface._get_node("Size", name=size)
if size_node:
log.info("Found node for size '%s'.", size)
else:
log.info("No node found for size '%s'.", size)
industry_node = GraphInterface._get_node("Industry", name=industry)
if industry_node:
log.info("Found node for industry '%s'.", industry)
else:
log.info("No node found for industry '%s'.", industry)
# If no figures were found for this pairing, returns the fallback values.
if size_node is None and industry_node is None:
return GraphInterface._get_costs_distribution()
dist: Union[Dict[float, float], None] = GraphInterface._get_costs_distribution(
size, industry
)
if dist is not None:
log.debug(
"Returned values are: mean = %s, stddev = %s",
str(dist["mean"]),
str(dist["stddev"]),
)
return dist
@staticmethod
def get_sizes() -> List[str]:
"""Returns a list of all of the organisation size values."""
nodes = GraphInterface._get_nodes("Size")
return [node["name"] for node in nodes]
@staticmethod
def get_industries() -> List[str]:
"""Returns a list of all of the organisation industry values."""
nodes = GraphInterface._get_nodes("Industry")
return [node["name"] for node in nodes]
@staticmethod
def get_sizes_and_industries() -> Tuple[list, list]:
"""Returns all available organisation size and industry values."""
return GraphInterface.get_sizes(), GraphInterface.get_industries()
# pylint: disable=invalid-name
@staticmethod
def create_incident_frequency_distribution_node(
pairing: Tuple, a: float, b: float
) -> Node:
"""Adds an `IncidentFrequencyDistribution` node to the Neo4j graph database."""
size_node = GraphInterface._get_node("Size", name=pairing[0])
industry_node = GraphInterface._get_node("Industry", name=pairing[1])
node = GraphInterface._create_node(
"IncidentFrequencyDistribution", a=a, b=b, calculated_at=datetime.now()
)
GraphInterface._create_relationship(node, FOR_SIZE, size_node)
GraphInterface._create_relationship(node, FOR_INDUSTRY, industry_node)
return node
# pylint: enable=invalid-name
@staticmethod
def create_incident_costs_distribution_node(
pairing: Tuple, mean: float, stddev: float
) -> Node:
"""Adds an `IncidentCostsDistribution` node to the Neo4j graph database."""
size_node = GraphInterface._get_node("Size", name=pairing[0])
industry_node = GraphInterface._get_node("Industry", name=pairing[1])
node = GraphInterface._create_node(
"IncidentCostsDistribution",
mean=mean,
stddev=stddev,
calculated_at=datetime.now(),
)
GraphInterface._create_relationship(node, FOR_SIZE, size_node)
GraphInterface._create_relationship(node, FOR_INDUSTRY, industry_node)
return node
# pylint: disable=anomalous-backslash-in-string,invalid-name
@staticmethod
def _get_frequency_distribution(
size: str = "All", industry: str = "All"
) -> Union[Dict[str, float], None]:
"""
Returns the :math:`a` and :math:`b` values from the requested incident
frequency distribution node (if it exists). Call with no arguments to
use the fallback (:math:`\left(\text{All}, \text{All}\right)`) node.
"""
# pylint: enable=anomalous-backslash-in-string
# pylint: disable=line-too-long
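# The doubled braces survive `str.format()` as literal '{'/'}' characters,
# so the query reaching Neo4j contains ordinary Cypher property maps.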
result = GraphInterface.g.run(
"MATCH (:Size {{name:'{}'}})<-[:FOR_SIZE]-(node:IncidentFrequencyDistribution)-[:FOR_INDUSTRY]->(:Industry {{name:'{}'}}) "
"RETURN node;".format(size, industry)
)
# pylint: enable=line-too-long
nodes = [record["node"] for record in result]
if len(nodes) == 0:
# There should always be a (All, All) distribution at least.
if size == "All" and industry == "All":
raise Exception("No fallback node found!")
log.debug(
"No incident frequency distribution found for (%s, %s).",
str(size),
str(industry),
)
return None
log.debug("Results: %s", str(nodes))
a = [node["a"] for node in nodes]
b = [node["b"] for node in nodes]
if len(nodes) > 1:
log.info("Multiple fallback nodes found, averaging parameters...")
a = sum(a) / len(a)
b = sum(b) / len(b)
else:
a = a[0]
b = b[0]
return {"a": a, "b": b}
# pylint: enable=invalid-name
# pylint: disable=anomalous-backslash-in-string
@staticmethod
def _get_costs_distribution(
size: str = "All", industry: str = "All"
) -> Union[Dict[str, float], None]:
"""
Returns the mean and standard deviation values from the requested incident
costs distribution node (if it exists). Call with no arguments to use the
fallback (:math:`\left(\text{All}, \text{All}\right)`) node.
"""
# pylint: enable=anomalous-backslash-in-string
# pylint: disable=line-too-long
result = GraphInterface.g.run(
"MATCH (:Size {{name:'{}'}})<-[:FOR_SIZE]-(node:IncidentCostsDistribution)-[:FOR_INDUSTRY]->(:Industry {{name:'{}'}}) "
"RETURN node;".format(size, industry)
)
# pylint: enable=line-too-long
nodes = [record["node"] for record in result]
if len(nodes) == 0:
# There should always be a (All, All) distribution at least.
if size == "All" and industry == "All":
raise Exception("No fallback node found!")
log.debug(
"No incident costs distribution found for (%s, %s).",
str(size),
str(industry),
)
return None
log.debug("Results: %s", str(nodes))
mean = [node["mean"] for node in nodes]
stddev = [node["stddev"] for node in nodes]
if len(nodes) > 1:
log.info("Multiple fallback nodes found, averaging parameters...")
mean = sum(mean) / len(mean)
stddev = sum(stddev) / len(stddev)
else:
mean = mean[0]
stddev = stddev[0]
return {"mean": mean, "stddev": stddev}
# pylint: disable=invalid-name
@staticmethod
def _create_node(*labels, **properties) -> Node:
"""Creates a new node in the Neo4j graph database."""
tx = GraphInterface.g.begin()
node = Node(*labels, **properties)
tx.create(node)
tx.commit()
return node
# pylint: enable=invalid-name
# pylint: disable=invalid-name
@staticmethod
def _create_relationship(
start_node, relationship, end_node, **properties
) -> Relationship:
"""Creates a new relationship in the Neo4j graph database."""
tx = GraphInterface.g.begin()
relationship = Relationship(
start_node, relationship.__name__, end_node, **properties
)
tx.create(relationship)
tx.commit()
return relationship
# pylint: enable=invalid-name
@staticmethod
def _get_node(*labels, **properties) -> Union[Node, None]:
"""Returns a node from the Neo4j graph database."""
return GraphInterface.g.nodes.match(*labels, **properties).first()
@staticmethod
def _get_nodes(*labels, **properties) -> NodeMatcher:
"""Returns a node from the Neo4j graph database."""
return GraphInterface.g.nodes.match(*labels, **properties)
@staticmethod
def _dict_to_jsobj(properties) -> str:
"""Recursively converts a Python `dict` into a JS `Object`."""
if isinstance(properties, dict):
return re.sub("'([a-z_]*)':", "\\1:", str(properties))
if isinstance(properties, str):
return GraphInterface._dict_to_jsobj({"name": properties})
return "{}"
# pylint: disable=invalid-name,missing-class-docstring
class SUBSECTION_OF(Relationship):
pass
class SECTION_OF(Relationship):
pass
class SIMILAR_TO(Relationship):
pass
class FOR_SIZE(Relationship):
pass
class FOR_INDUSTRY(Relationship):
pass
# pylint: enable=invalid-name,missing-class-docstring

420
src/montecarlo.py Normal file

@@ -0,0 +1,420 @@
"""
Monte Carlo Simulation Script
This script runs a Monte Carlo simulation for an organisation of a given
size and industry, utilising the most relevant distributions available.
Acknowledgements: Dr Dan Prince & Dr Chris Sherlock
"""
import os
import sys
import argparse
import pickle
import logging as log
from typing import Tuple, Dict, Union
import random
import math
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
from graph import GraphInterface as gi
# Used for logging, equivalent to `logging.INFO`.
SUCCESS = 20
# If not specified, the default number of Monte Carlo simulation runs to perform.
DEFAULT_RUNS = 5000
# The arbitrary maximum number of incidents that an organisation can experience
# in a year.
MAX_ANNUAL_INCIDENTS = 8000
# The maximum value of a company; any yearly losses over this value result in bankruptcy.
COMPANY_VALUE = 100000
# A smaller value increases the curviness of the loss exceedance curve.
# Less than 30 starts to get a bit steppy, though.
LEC_PRECISION = math.floor(COMPANY_VALUE / 30)
# Quantifies the qualitative boundaries for human-readable incident frequencies,
# which many sources (e.g., the CSBS 2020) use to present their results:
#
# 'None' = 0
# 'Annually' = 1
# 'Less than monthly' = 2-7
# 'Monthly' = 8-17
# 'Weekly' = 18-79
# 'Daily' = 80-399
# 'More than daily' = 400-8000
BOUNDARIES = {
"None": 0,
"Once per year": 1,
"Less than once a month": 2,
"Once a month": 8,
"Once a week": 18,
"Once a day": 80,
"Several times a day": 400,
"MAX": MAX_ANNUAL_INCIDENTS,
}
N = None
OUTPUT_DIR = None
IMAGES = None
FORCE = None
def _calculate_num_of_incidents(incidents_dist: Dict[float, float]) -> float:
"""Calculate how many incidents have occurred in a given year."""
log.debug("Incident distribution: %s", str(incidents_dist))
num_of_incidents = incidents_dist["b"] / (1 - np.random.uniform()) ** (
1 / incidents_dist["a"]
)
log.debug("Number of incidents (as `int`): %s", str(int(num_of_incidents)))
return (
int(num_of_incidents)
if num_of_incidents <= MAX_ANNUAL_INCIDENTS
else MAX_ANNUAL_INCIDENTS
)
def _calculate_sum_cost_of_incidents(
num_of_incidents: int, costs_dist: Dict[float, float], idx: int = None
) -> float:
"""For a list of incident numbers, calculate how much each breach cost and
return the sum."""
log.debug("Costs distribution: %s", str(costs_dist))
if (N < 1000) or (N >= 1000 and idx % math.floor(N / 100) == 0):
log.info(
"Running Monte Carlo simulation... (%s/%s iterations)", str(idx), str(N)
)
if num_of_incidents == 0:
return 0
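# Moment-matching: for a log-normal with arithmetic mean m and standard
# deviation s, the underlying normal has mu = ln(m^2 / sqrt(s^2 + m^2)) and
# sigma = sqrt(ln(1 + s^2 / m^2)), which are the `mu`/`sigma` arguments
# expected by `random.lognormvariate`.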
loc = np.log(
costs_dist["mean"] ** 2
/ np.sqrt(costs_dist["stddev"] ** 2 + costs_dist["mean"] ** 2)
)
shape = np.sqrt(np.log(1 + (costs_dist["stddev"] ** 2 / costs_dist["mean"] ** 2)))
costs = [random.lognormvariate(loc, shape) for r in range(num_of_incidents)]
return sum(costs)
# pylint: disable=invalid-name
def _get_most_relevant_incident_frequency_distribution(
pairing: Tuple = ("All", "All")
) -> Union[Dict[float, float], None]:
"""Gets the distribution for incident frequency from the data in the Neo4j
graph database."""
log.info(
"Finding most relevant incident frequency distribution for %s...", str(pairing)
)
return gi.get_incident_frequency_distribution(pairing)
# pylint: enable=invalid-name
def _get_most_relevant_incident_costs_distribution(
pairing: Tuple = ("All", "All")
) -> Union[Dict[float, float], None]:
"""Gets the distribution for incident costs from the data in the Neo4j
graph database."""
log.info(
"Finding most relevant incident costs distribution for %s...", str(pairing)
)
return gi.get_incident_costs_distribution(pairing)
def _get_most_relevant_distributions(
pairing: Tuple = ("All", "All")
) -> Tuple[Union[Dict[float, float], None], Union[Dict[float, float], None]]:
"""Generate (or retrieve) a population of annual incident quantities and a
distribution of incident-with-outcome cost values."""
# -- caching --
# Retrieves previously-calculated values if possible
if not FORCE and OUTPUT_DIR is not None:
try:
filename = "{}-{}.pickle".format(pairing[0], pairing[1])
dists = pickle.load(open(OUTPUT_DIR + filename, "rb"))
log.info("Previously-calculated distributions found")
return dists["incidents"], dists["costs"]
except (OSError, IOError):
log.info("Previously-calculated distributions not found")
# Otherwise, generates fresh ones
gi.__init__()
incidents_dist = _get_most_relevant_incident_frequency_distribution(pairing)
costs_dist = _get_most_relevant_incident_costs_distribution(pairing)
log.debug(
"Returned values are: incidents_dist = %s, costs_dist = %s",
str(incidents_dist),
str(costs_dist),
)
# Saves the figures for faster analysis in future
if OUTPUT_DIR is not None and incidents_dist is not None and costs_dist is not None:
dists = {
"incidents": incidents_dist,
"costs": costs_dist,
}
filename = "{}-{}.pickle".format(pairing[0], pairing[1])
pickle.dump(dists, open(OUTPUT_DIR + filename, "wb"))
return incidents_dist, costs_dist
# pylint: disable=anomalous-backslash-in-string
def _run_monte_carlo_simulation(pairing: Tuple = ("All", "All")) -> Tuple:
"""
Runs :math:`n` simulations of a 12-month period, calculating the number
of incidents encountered each time and their cumulative costs.
"""
# pylint: enable=anomalous-backslash-in-string
# Generates both distributions
incidents_dist, costs_dist = _get_most_relevant_distributions(pairing)
if incidents_dist is None or costs_dist is None:
return incidents_dist, costs_dist
# Calculates the number of incidents suffered over $n$ simulated years
nums_of_incidents = np.array(
[_calculate_num_of_incidents(incidents_dist) for i in range(N)]
)
log.debug("Number of incidents: %s", str(nums_of_incidents))
_label_plot(
"Histogram of Incident Frequencies (over 12 months)",
"Number of Incidents ($log_{10}$)",
"Frequency",
)
plt.hist(
[np.log10(i) if i > 0 else 0 for i in nums_of_incidents],
align="left",
bins=range(12),
)
_save_plot("2 - histogram of incident frequencies")
# Calculates the annual costs for each simulated year
log.info("Running Monte Carlo simulation... (0/%s iterations)", str(N))
sum_costs = [
_calculate_sum_cost_of_incidents(num_of_incidents, costs_dist, idx)
for idx, num_of_incidents in enumerate(nums_of_incidents, start=1)
]
log.info("Running Monte Carlo simulation... (%s/%s iterations)", str(N), str(N))
_label_plot(
"Histogram of Sum Costs (over 12 months)", "Total Cost (£)", "Frequency"
)
plt.ticklabel_format(style="plain")
plt.hist(sum_costs, align="left", bins=15, range=(0, COMPANY_VALUE))
_save_plot("4 - histogram of sum costs")
_label_plot("Density of Sum Costs (over 12 months)", "Total Cost (£)", "Density")
pd.Series(sum_costs).plot(kind="density")
plt.xlim(0, COMPANY_VALUE * 2)
plt.ticklabel_format(style="plain")
_save_plot("5 - density of sum costs")
# Get loss exceedance curve
log.info("Generating loss exceedance curve")
hist, edges = np.histogram(sum_costs, bins=LEC_PRECISION)
cumrev = np.cumsum(hist[::-1])[::-1] * 100 / len(sum_costs)
_label_plot(
"Loss Exceedance Curve (Monte Carlo sim)",
"Loss (£, 99th percentile)",
"Chance of Loss or Greater (%)",
)
plt.ticklabel_format(style="plain")
plt.xlim(0, COMPANY_VALUE)
plt.plot(edges[:-1], cumrev)
_save_plot("6 - lec" if IMAGES else "lec")
log.info("Simulation complete!")
return nums_of_incidents, sum_costs
def main():
"""Called when the script is run from the command-line"""
# pylint: disable=global-statement
global N, OUTPUT_DIR, IMAGES, FORCE
# pylint: enable=global-statement
parser = argparse.ArgumentParser()
parser.add_argument(
"-n",
"--number",
help="The number of simulations to run (default: " + str(DEFAULT_RUNS) + ")",
type=int,
default=DEFAULT_RUNS,
)
parser.add_argument(
"-s",
"--size",
help="The size of the organisation to simulate (default: all)",
type=str,
default="All",
)
parser.add_argument(
"-i",
"--industry",
help="The industry of the organisation to simulate (default: all)",
type=str,
default="All",
)
parser.add_argument(
"-o",
"--output",
help="Specify the output directory (default: ./output/)",
type=str,
default=os.path.join(os.path.dirname(__file__), "output/"),
metavar="DIRECTORY",
)
parser.add_argument(
"-p",
"--images",
help="Output images at each step of the script (default: false, just \
output the final LEC image)",
action="store_true",
default=False,
)
parser.add_argument(
"-f",
"--force",
help="Force re-generation of incident and cost distributions (default: false)",
action="store_true",
default=False,
)
parser.add_argument(
"-v",
"--verbose",
help="Verbose console output (default: false)",
action="store_true",
default=False,
)
parser.add_argument(
"-d",
"--debug",
help="Show debug console output (default: false)",
action="store_true",
default=False,
)
args = parser.parse_args()
N = args.number
OUTPUT_DIR = args.output
IMAGES = args.images
FORCE = args.force
size = args.size
industry = args.industry
if args.debug:
log.basicConfig(format="%(levelname)s: %(message)s", level=log.DEBUG)
log.info("Debug output.")
elif args.verbose:
log.basicConfig(format="%(levelname)s: %(message)s", level=log.INFO)
log.info("Verbose output.")
else:
log.basicConfig(format="%(levelname)s: %(message)s")
if not os.path.isdir(OUTPUT_DIR):
os.makedirs(OUTPUT_DIR)
if size or industry:
print("Running simulation for ({}, {})".format(size, industry))
nums_of_incidents, sum_costs = _run_monte_carlo_simulation((size, industry))
if nums_of_incidents is not None and sum_costs is not None:
log.info(
"Results:\nNumbers of incidents: %s\nSum costs: %s\n",
str(nums_of_incidents),
str(sum_costs),
)
avg_num_of_incidents = int(sum(nums_of_incidents) / len(nums_of_incidents))
avg_sum_costs = sum(sum_costs) / len(sum_costs)
log.log(
SUCCESS,
"Results:\nAverage number of incidents: %d\nAverage cost: £%.2f",
avg_num_of_incidents,
avg_sum_costs,
)
# Print output that will be picked up by game server.
# pylint: disable=fixme
# TODO: For some reason the results at the moment are orders of magnitude
# too high, so for now I've plugged it by dividing both results by 100.
# pylint: enable=fixme
print(int(avg_num_of_incidents / 100))
print("%.2f" % (avg_sum_costs / 100))
else:
log.warning("No data found.")
print("No data found.")
print("Running simulation for (All, All)")
gen_nums_of_incidents, gen_sum_costs = _run_monte_carlo_simulation()
log.info(
"Results:\nNumbers of incidents: %s\nSum costs: %s\n",
str(gen_nums_of_incidents),
str(gen_sum_costs),
)
avg_gen_num_of_incidents = int(
sum(gen_nums_of_incidents) / len(gen_nums_of_incidents)
)
avg_gen_sum_costs = sum(gen_sum_costs) / len(gen_sum_costs)
log.log(
SUCCESS,
"Results:\nAverage number of incidents: %d\nAverage cost: £%.2f",
avg_gen_num_of_incidents,
avg_gen_sum_costs,
)
# Print output that will be picked up by the game server.
print(int(avg_gen_num_of_incidents / 100))
print("%.2f" % (avg_gen_sum_costs / 100))
sys.exit(0)
def _label_plot(title="Untitled Plot", xlabel="x axis", ylabel="y axis") -> None:
"""Apply titles and axis labels to a plot."""
plt.title(title)
plt.xlabel(xlabel)
plt.ylabel(ylabel)
def _save_plot(filename="untitled", always=False) -> None:
"""Save a plot (when image output is enabled or `always` is set) and clear the figure."""
if IMAGES or always:
plt.savefig(OUTPUT_DIR + filename + ".png")
plt.clf()
if __name__ == "__main__":
main()

103
src/montecarlo.r Normal file

@@ -0,0 +1,103 @@
#
# Secure Digitalisation Monte Carlo Simulation Script
#
# This script runs a Monte Carlo simulation using breach likelihood and cost
# figures derived from the Cyber Security Breaches Survey 2020 (CSBS).
# This script is an unfinished prototype, and has since been superseded by
# `montecarlo.py`.
#
# Acknowledgements: Dr Dan Prince & Dr Chris Sherlock
#
masses = c(0.54, 0.1058, 0.1012, 0.0966, 0.069, 0.0368, 0.0414)
boundaries = c(1, 2, 8, 18, 80, 400, 8000)
Fs = cumsum(masses)
plot(log(boundaries), log(1 - Fs))
xs = log(boundaries)
ys = log(1 - Fs)
fit = lm(ys ~ xs)
summary(fit)
alogb = fit$coeff[1]
a = -fit$coeff[2]
b = exp(alogb/a)
print(a)
print(b)
n = 10000
us = runif(n)
xs = b / (1 - us)^(1 / a)
p0 = mean(xs < boundaries[1])
p1 = mean(xs < boundaries[2]) - p0
p2 = mean(xs < boundaries[3]) - p0 - p1
p3 = mean(xs < boundaries[4]) - p0 - p1 - p2
p4 = mean(xs < boundaries[5]) - p0 - p1 - p2 - p3
p5 = mean(xs < boundaries[6]) - p0 - p1 - p2 - p3 - p4
ps = c(p0, p1, p2, p3, p4, p5, 1 - (p0 + p1 + p2 + p3 + p4 + p5))
print(ps)
print(masses)
nattacks = floor(xs)
hist(log10(nattacks),
main = "Histogram of Number of Attacks/Breaches Over 12 Months",
xlab = expression("Number of Attacks (log"[10]*")"),
ylab = "Frequency",
breaks = 0:12)
# Plots the distribution for the average cost of breach(es) over 12 months
mean = 3230
median = 274
logstd = sqrt(2 * (log(mean) - if (median == 0) 0 else log(median)))
std = exp(1)^logstd
curve(dlnorm(x, log(mean), log(std)), from=1, to=5000,
main = "Average annual breach cost distribution",
xlab = 'Cost (£)',
ylab = 'Density',
lwd = 2)
# Runs the Monte Carlo simulation
simulateCosts <- function(n) {
return(if (n >= 1) sum(rlnorm(n, loc, shape)) else 0)
}
n = 10000
loc <- log(mean^2 / sqrt(std^2 + mean^2))
shape <- sqrt(log(1 + (std^2 / mean^2)))
numAttacks <- sample(nattacks, n)
results <- sapply(numAttacks, simulateCosts)
hist(results,
main = "Histogram of Total Costs Over 12 Months (Monte Carlo sim)",
xlab = "Total cost (£)")
d <- density(results)
plot(d,
main="Density of Total Costs Over 12 Months (Monte Carlo sim)",
xlab=expression("Total Cost (£)"),
ylab="Density")
# Get loss exceedance
# TODO: needs to be prettier, but `evaluate::loss_exceedance_curve()` is broken
maxValue = 2500
numOver <- length(results[results > maxValue])
risk = numOver/n
plot(d,
main="Loss Exceedance (Monte Carlo sim)",
xlab=expression("Total Cost (£)"),
ylab="Density")
abline(v = maxValue, col="red", lwd=3, lty=2)
text(3000, 4e-04, labels=paste(floor(risk*100), "% chance of ≥£", maxValue, " losses"), adj=c(0, 0.5))


@@ -0,0 +1,78 @@
#
# Secure Digitalisation Neo4j Connection Script
#
# This script is intended to establish a connection to a Neo4j graph database
# and submit commands.
# This script is an unfinished prototype, and has since been superseded by
# `graph.py`.
#
install.packages('tidyverse')
library(tidyverse)
install.packages('purrr')
library(purrr)
install.packages('devtools')
library(devtools)
install_github("davidlrosenblum/neo4r@4.x")
library(neo4r)
RUNS <- 1000
DECISION.STEPS <- 12
get_likelihood <- function() {
res <- 'MATCH (i:Incident) WHERE EXISTS (i.probability) AND NOT (i)-[:FOR_SIZE]-() AND NOT (i)-[:FOR_INDUSTRY]-() AND NOT (i)-[:FOR_AREA]-() RETURN i.probability AS probability;' %>%
call_neo4j(con, type = 'row')
res$probability / 100
}
# Currently only does direct costs
get_costs <- function() {
res <- 'MATCH (i:Incident) WHERE EXISTS (i.direct_costs) AND NOT (i)-[:FOR_SIZE]-() AND NOT (i)-[:FOR_INDUSTRY]-() AND NOT (i)-[:FOR_AREA]-() RETURN i.direct_costs[0] AS cost;' %>%
call_neo4j(con, type = 'row')
res$cost
}
calculate_cost <- function(alpha) {
l <- get_likelihood()
happen <- runif(1, 0, 1)
# An incident occurs with probability `l`.
if (happen < l) {
cost <- as.numeric(get_costs())
s <- sd(580:630)
# Moment-matching to the log-normal parameters, as in `montecarlo.py`.
location <- log(cost^2 / sqrt(s^2 + cost^2))
shape <- sqrt(log(1 + (s^2 / cost^2)))
rlnorm(1, location, shape)
} else {
0
}
}
con <- neo4j_api$new(
url="http://localhost:7474",
db="neo4j",
user="neo4j",
password="password"
)
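# Builds RUNS simulated trajectories of DECISION.STEPS steps each, accumulates
# the per-step values within each simulation, and reshapes the result into a
# long tibble of (simulation, step, value) rows for plotting.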
simulations <- rerun(RUNS, replicate(DECISION.STEPS, runif(1) %>% calculate_cost())) %>%
set_names(paste0("sim", 1:RUNS)) %>%
map(~ accumulate(., ~ .x * .y)) %>%
map_dfr(~ tibble(value = .x, step = 1:DECISION.STEPS), .id = "simulation")
simulations %>%
ggplot(aes(x = step, y = value)) +
geom_line(aes(color = simulation)) +
theme(legend.position = "none") +
ggtitle("Simulations of costs from breaches")
summary_values <- simulations %>%
group_by(step) %>%
summarise(mean_return = mean(value), max_return = max(value), min_return = min(value)) %>%
gather("series", "value", -step)
summary_values %>%
ggplot(aes(x = step, y = value)) +
geom_line(aes(color = series)) +
ggtitle("Mean values from simulations")


@@ -0,0 +1,332 @@
"""
Distributions (Re)generation Script
This script generates likelihood and cost distributions based on threat
intelligence data stored in a connected Neo4j graph database. It attempts to
do so for every possible permutation of (size, industry) values.
These are then consumed by `montecarlo.py`, which runs a Monte Carlo
simulation based on these figures.
Acknowledgements: Dr Dan Prince & Dr Chris Sherlock
"""
import os
import sys
import argparse
import warnings
import logging as log
from typing import Tuple
import itertools
import numpy as np
import pandas as pd
import statsmodels.formula.api as smf
from matplotlib import pyplot as plt
from scipy.stats import lognorm
from graph import GraphInterface as gi
# Used for logging, equivalent to `logging.WARNING` + 1.
SUCCESS = 31
# The arbitrary maximum number of incidents that an organisation can experience
# in a year.
MAX_ANNUAL_INCIDENTS = 8000
# Quantifies the qualitative boundaries for human-readable incident frequencies,
# which many sources (e.g., the CSBS 2020) use to present their results:
#
# 'None' = 0
# 'Annually' = 1
# 'Less than monthly' = 2-7
# 'Monthly' = 8-17
# 'Weekly' = 18-79
# 'Daily' = 80-399
# 'More than daily' = 400-8000
BOUNDARIES = {
"None": 0,
"Once per year": 1,
"Less than once a month": 2,
"Once a month": 8,
"Once a week": 18,
"Once a day": 80,
"Several times a day": 400,
"MAX": MAX_ANNUAL_INCIDENTS,
}
OUTPUT_DIR = None
IMAGES = None
# pylint: disable=invalid-name,anomalous-backslash-in-string
def _generate_new_incident_frequency_distribution(pairing: Tuple = (None, None)) -> int:
"""
Generates a new incident frequency distribution.
Notes
-----
(Re)generates the incident frequency distribution for a
:math:`\left(\text{size}, \text{industry}\right)` pairing from the data in
a Neo4j graph database.
Currently this only produces log-normal distributions. Additional types of
distribution can be implemented by overloading this method (by importing the
`multipledispatch` package) and returning the values required for defining
that distribution (e.g., :math:`\mu` and :math:`\sigma` instead of :math:`a`
and :math:`b`).
"""
# pylint: enable=anomalous-backslash-in-string
log.info("Generating new incident frequency distribution for '%s'...", str(pairing))
# Attempts to get the incident probabilities for the pairing from the graph
# database
incident_frequency_probabilities = gi.get_incident_frequency_probabilities(
list(BOUNDARIES.values())[:-1], pairing
)
if incident_frequency_probabilities is None:
log.info(
"No incident frequency distribution generated for '%s'.",
str(pairing),
)
return 0
log.debug(
"Returned values are: incident frequency probabilities = %s",
str(incident_frequency_probabilities),
)
# If values are found, generate a distribution
Fs = np.cumsum(incident_frequency_probabilities)
xs = np.log(list(BOUNDARIES.values())[1:])
ys = np.log(1 - Fs)
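# Fitting a straight line to the log-log survival values recovers the Pareto
# parameters: log(1 - F(x)) = a*log(b) - a*log(x), so the intercept estimates
# a*log(b) and the slope estimates -a (hence b = exp(intercept / a) below).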
data = pd.DataFrame({"xs": xs, "ys": ys})
# pylint: disable=line-too-long
# See <https://www.statsmodels.org/stable/_modules/statsmodels/stats/stattools.html#omni_normtest> for explanation
# pylint: enable=line-too-long
with warnings.catch_warnings():
warnings.simplefilter("ignore")
fit = smf.ols(formula="ys ~ xs", data=data).fit()
log.debug(fit.summary())
# Get the parameters for the generated distribution and store them in the
# graph database.
alogb = fit.params[0]
a = -fit.params[1]
b = np.exp(alogb / a)
gi.create_incident_frequency_distribution_node(pairing, a, b)
log.log(
SUCCESS,
"New incident frequency distribution successfully generated for '%s'.",
str(pairing),
)
return 1
# pylint: enable=invalid-name
# pylint: disable=anomalous-backslash-in-string
def _generate_new_incident_costs_distribution(pairing: Tuple = (None, None)) -> int:
"""
(Re)generates the incident cost distribution for a
:math:`\left(\text{size}, \text{industry}\right)` pairing from the data in
a Neo4j graph database.
Currently this only produces log-normal distributions. Additional types of
distribution can be implemented by overloading this method (by importing the
`multipledispatch` package) and returning the values required for defining
that distribution (e.g., :math:`\mu` and :math:`\sigma` instead of :math:`a`
and :math:`b`).
"""
# pylint: enable=anomalous-backslash-in-string
# Plots the distribution for the average cost of incident(s) over 12 months
log.info("Generating new incident cost distribution for '%s'...", str(pairing))
incident_mean_cost, incident_median_cost = gi.get_incident_cost_averages(pairing)
if incident_mean_cost is None or incident_median_cost is None:
log.info(
"No incident costs distribution generated for '%s'.",
str(pairing),
)
return 0
log.debug(
"Returned values are: mean = %s, median = %s",
str(incident_mean_cost),
str(incident_median_cost),
)
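# For a log-normal distribution, median = exp(mu) and mean = exp(mu + sigma^2 / 2),
# so sigma can be recovered from the two averages as
# sigma = sqrt(2 * (ln(mean) - ln(median))); the exponentiated value is then
# stored as `stddev`, mirroring the prototype in `montecarlo.r`.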
log_stddev = np.sqrt(
2
* (
np.log(incident_mean_cost)
- (0 if incident_median_cost == 0 else np.log(incident_median_cost))
)
)
stddev = np.exp(log_stddev)
_label_plot(
"Average annual incident-with-outcome cost distribution", "Cost (£)", "Density"
)
plt.plot(
[
lognorm.pdf(
np.log(i),
np.log(incident_mean_cost),
np.log(incident_median_cost) if incident_median_cost > 0 else 0,
)
for i in range(1, 2500)
]
)
_save_plot("3 - cost dist")
gi.create_incident_costs_distribution_node(pairing, incident_mean_cost, stddev)
log.log(
SUCCESS,
"New incident costs distribution successfully generated for '%s'.",
str(pairing),
)
return 1
def _generate_new_distributions(pairing: Tuple = (None, None)) -> Tuple:
"""(Re)generates the cost and likelihood distributions."""
gi.__init__()
log.info("Existing distributions deleted: %s", bool(gi.delete_distributions()))
successful_incidents_dists = 0
successful_costs_dists = 0
# If either size or industry is unspecified, gets all possible values.
sizes = gi.get_sizes() if pairing[0] is None else [pairing[0]]
industries = gi.get_industries() if pairing[1] is None else [pairing[1]]
# Attempts to generate new distributions for every combination of size and
# industry values.
for pair in list(itertools.product(sizes, industries)):
successful_incidents_dists += _generate_new_incident_frequency_distribution(
pair
)
successful_costs_dists += _generate_new_incident_costs_distribution(pair)
return successful_incidents_dists, successful_costs_dists
def main():
"""Called when the script is run from the command-line."""
# pylint: disable=global-statement
global OUTPUT_DIR, IMAGES
# pylint: enable=global-statement
parser = argparse.ArgumentParser()
parser.add_argument(
"-s",
"--size",
help="Specify the org. size (default: None)",
choices=["micro", "small", "medium", "large"],
type=str,
default=None,
)
parser.add_argument(
"-i",
"--industry",
help="Specify the org. industry SIC code (top-level only, e.g. C for "
"Manufacturing) (default: None)",
choices=list(map(chr, range(65, 86))),
type=str,
default=None,
)
parser.add_argument(
"-o",
"--output",
help="Specify the output directory (default: ./output/)",
type=str,
default=os.path.join(os.path.dirname(__file__), "output/"),
metavar="DIRECTORY",
)
parser.add_argument(
"-p",
"--images",
help="Output images at each step of the script (default: false, just "
"output the final LEC image)",
action="store_true",
default=False,
)
parser.add_argument(
"-v",
"--verbose",
help="Verbose console output (default: false)",
action="store_true",
default=False,
)
parser.add_argument(
"-d",
"--debug",
help="Show debug console output (default: false)",
action="store_true",
default=False,
)
args = parser.parse_args()
OUTPUT_DIR = args.output
IMAGES = args.images
size = args.size
industry = args.industry
if args.debug:
log.basicConfig(format="%(levelname)s: %(message)s", level=log.DEBUG)
log.info("Debug output.")
elif args.verbose:
log.basicConfig(format="%(levelname)s: %(message)s", level=log.INFO)
log.info("Verbose output.")
else:
log.basicConfig(format="%(levelname)s: %(message)s")
if not os.path.isdir(OUTPUT_DIR):
os.makedirs(OUTPUT_DIR)
incidents_dists, costs_dists = _generate_new_distributions((size, industry))
log.log(
SUCCESS,
"Successfully generated %s incident frequency distributions and %s "
"incident costs distributions!",
str(incidents_dists),
str(costs_dists),
)
sys.exit(0)
def _label_plot(title="Untitled Plot", xlabel="x axis", ylabel="y axis") -> None:
"""Apply titles and axis labels to a plot."""
plt.title(title)
plt.xlabel(xlabel)
plt.ylabel(ylabel)
def _save_plot(filename="untitled") -> None:
"""Save a plot and clear the figure."""
if IMAGES:
plt.savefig(OUTPUT_DIR + filename + ".png")
plt.clf()
if __name__ == "__main__":
main()