initial commit
This commit is contained in:
commit
51deec9859
14 changed files with 3059 additions and 0 deletions
696
src/graph.py
Normal file
696
src/graph.py
Normal file
|
@ -0,0 +1,696 @@
|
|||
"""
|
||||
Neo4j Graph Database Interface
|
||||
|
||||
This module defines:
|
||||
a) the interface for interacting with the Neo4j graph database; and
|
||||
b) subclasses of `Relationship`.
|
||||
"""
|
||||
|
||||
import re
|
||||
import sys
|
||||
import logging as log
|
||||
from typing import List, Tuple, Union, Dict
|
||||
from datetime import datetime
|
||||
from py2neo import Graph, Node, NodeMatcher, Relationship, DatabaseError
|
||||
import numpy as np
|
||||
|
||||
|
||||
class GraphInterface:
|
||||
"""
|
||||
An interface for the Neo4j graph database used to hold TI data.
|
||||
|
||||
This interface abstracts out the actual transactions, allowing a user
|
||||
to use more friendly methods without worrying about the implementation or
|
||||
learning the Cypher query language syntax.
|
||||
|
||||
This class should:
|
||||
a) determine the correct transactions to use based on the called
|
||||
method and any arguments;
|
||||
b) return only `Node`s, `Relationship`s, `SubGraph`s or lists thereof,
|
||||
so that the values can be assigned to subclasses of those at the
|
||||
point of calling; and
|
||||
c) deal with any `Exception`s, but not issues like returning 0 results,
|
||||
which should be dealt with at the point of calling.
|
||||
"""
|
||||
|
||||
g: Graph = None
|
||||
|
||||
    @staticmethod
    def __init__():
        """
        Opens the shared (class-level) Neo4j database connection, if it is
        not already open.

        NOTE(review): an `@staticmethod` `__init__` is unconventional —
        instantiating `GraphInterface()` would not work; callers invoke
        `GraphInterface.__init__()` directly, treating this as a `connect()`
        routine. Exits the whole process if the connection cannot be opened.
        """
        try:
            if GraphInterface.g is None:
                # Assumes a local Neo4j instance reachable with the default
                # user and password "test" — TODO confirm deployment config.
                GraphInterface.g = Graph(password="test")
                log.info("Neo4j database connection opened successfully.")
            else:
                log.warning(
                    "Neo4j database already connected - this branch "
                    "shouldn't have been hit though!"
                )
        except DatabaseError:
            log.error("ERR: Neo4j database connection not successfully opened.")
            sys.exit()
|
||||
|
||||
@staticmethod
|
||||
def delete_distributions() -> bool:
|
||||
"""Deletes any pre-existing distributions."""
|
||||
GraphInterface.g.run(
|
||||
"MATCH (n) "
|
||||
"WHERE n:IncidentFrequencyDistribution OR n:IncidentCostsDistribution "
|
||||
"DETACH DELETE n;"
|
||||
)
|
||||
return True
|
||||
|
||||
@staticmethod
|
||||
def get_incident_frequency_probabilities(
|
||||
boundaries, pairing: Tuple = ("All", "All")
|
||||
) -> List[float]:
|
||||
"""
|
||||
Attempts to get a list of probabilities for different annual incident
|
||||
frequencies, specific to the organisational details provided.
|
||||
|
||||
It first gets (the average of) any sets of base frequencies, then looks
|
||||
up the provided size/industry values to see if they have any assigned
|
||||
breach probability values in the graph database. If multiple values are
|
||||
found, the average is taken.
|
||||
|
||||
Once the specific base (i.e., >0) probability is found, it then recalculates
|
||||
the overall set of probabilities as proportions of that base figure.
|
||||
"""
|
||||
size = pairing[0]
|
||||
industry = pairing[1]
|
||||
|
||||
log.info(
|
||||
"Attempting to get breach frequency probabilities specific to ('%s', '%s')...",
|
||||
size,
|
||||
industry,
|
||||
)
|
||||
|
||||
base_frequency_probabilities_nodes = GraphInterface._get_nodes(
|
||||
"IncidentBaseFrequencyProbabilities"
|
||||
)
|
||||
base_frequency_probabilities = [
|
||||
node["probabilities"]
|
||||
for node in base_frequency_probabilities_nodes
|
||||
if len(node["probabilities"]) == (len(boundaries) - 1)
|
||||
]
|
||||
|
||||
# If there are >1 sets of likelihoods, gets the mean for each boundary value.
|
||||
if len(base_frequency_probabilities) > 1:
|
||||
log.info("Multiple sets of base frequencies found, averaging...")
|
||||
base_frequency_probabilities = np.array(base_frequency_probabilities)
|
||||
base_frequency_probabilities = [
|
||||
np.mean(base_frequency_probabilities[:, i])
|
||||
for i in range(len(boundaries))
|
||||
]
|
||||
|
||||
probability_of_breach = GraphInterface.get_probability_of_breach(size, industry)
|
||||
if probability_of_breach:
|
||||
log.info(
|
||||
"Found specific >0 breaches probability value for one or both "
|
||||
"of ('%s', '%s'), calculating follow-on values...",
|
||||
size,
|
||||
industry,
|
||||
)
|
||||
# Sets the probability of having 0 breaches.
|
||||
breach_frequency_probabilities = [(100 - probability_of_breach) / 100]
|
||||
|
||||
# Calculates the remaining probabilities proportional to the sum
|
||||
# >0 breaches probability.
|
||||
for base_frequency_probability in base_frequency_probabilities[0]:
|
||||
breach_frequency_probabilities.append(
|
||||
(probability_of_breach * base_frequency_probability) / 100
|
||||
)
|
||||
|
||||
if len(breach_frequency_probabilities) != len(boundaries):
|
||||
raise Exception("Mismatched boundaries!")
|
||||
|
||||
return breach_frequency_probabilities
|
||||
|
||||
log.info("No breach probability value found.")
|
||||
return None
|
||||
|
||||
    # pylint: disable=too-many-branches,too-many-locals,too-many-statements
    @staticmethod
    def get_probability_of_breach(size="All", industry="All") -> float:
        """
        Returns the probability of an organisation of a given size and/or
        industry experiencing a breach with an outcome in the next year.

        NOTE(review): despite the original design note ("size is chosen"
        over industry), the implementation *averages* the two figures when
        both are available; a lone figure is used as-is, and None is returned
        when no node exists for either value.

        The value is presumably a percentage in [0, 100] — the caller
        (`get_incident_frequency_probabilities`) divides it by 100; confirm
        against the data-loading code.
        """
        size_probability = None
        industry_probability = None

        size_node = GraphInterface._get_node("Size", name=size)
        if size_node:
            log.info("Found node for size '%s'.", size)
        else:
            log.info("No node found for size '%s'.", size)

        industry_node = GraphInterface._get_node("Industry", name=industry)
        if industry_node:
            log.info("Found node for industry '%s'.", industry)
        else:
            log.info("No node found for industry '%s'.", industry)

        # If no figures were found for this pairing, returns None.
        if size_node is None and industry_node is None:
            return None

        if size_node:
            size_relations = GraphInterface.g.match({size_node}, r_type=FOR_SIZE)

            # Collects the probability from every IncidentProbability node
            # attached to this size.
            size_probabilities = []
            for rel in size_relations:
                if rel.start_node.has_label("IncidentProbability"):
                    size_probabilities.append(rel.start_node["probability"])

            if len(size_probabilities) > 1:
                log.info(
                    "Multiple probabilities found for size '%s', averaging...", size
                )
                size_probability = sum(size_probabilities) / len(size_probabilities)
            elif len(size_probabilities) == 1:
                log.info("Probability value found for size '%s'.", size)
                size_probability = size_probabilities[0]
            else:
                log.info("No probability value found for size '%s'.", size)

        if industry_node:
            industry_relations = GraphInterface.g.match(
                {industry_node}, r_type=FOR_INDUSTRY
            )
            # Collects the probability from every IncidentProbability node
            # attached to this industry.
            industry_probabilities = []
            for rel in industry_relations:
                if rel.start_node.has_label("IncidentProbability"):
                    industry_probabilities.append(rel.start_node["probability"])

            if len(industry_probabilities) > 1:
                log.info(
                    "Multiple probabilities found for industry '%s', averaging...",
                    industry,
                )
                industry_probability = sum(industry_probabilities) / len(
                    industry_probabilities
                )
            elif len(industry_probabilities) == 1:
                log.info("Probability value found for industry '%s'.", industry)
                industry_probability = industry_probabilities[0]
            else:
                log.info("No probability value found for industry '%s'.", industry)

        # NOTE(review): a stored probability of exactly 0 is falsy, so the
        # checks below treat it as "no figure found" — confirm that 0 values
        # cannot occur in the data.
        if size_probability and industry_probability:
            log.info(
                "Probabilities found for both size '%s' and industry '%s', averaging...",
                size,
                industry,
            )
            probability = (size_probability + industry_probability) / 2
        else:
            probability = size_probability or industry_probability

        return probability

    # pylint: enable=too-many-branches,too-many-locals,too-many-statements
|
||||
|
||||
    # pylint: disable=too-many-branches,too-many-locals,too-many-statements
    @staticmethod
    def get_incident_cost_averages(
        pairing: Tuple = ("All", "All")
    ) -> Tuple[float, float]:
        """
        Attempts to get the average incident costs over a year, specific to the
        organisational details provided.

        Returns a `(mean, median)` tuple; either element may be None when no
        figure exists for it. When *neither* a size nor an industry node is
        found, a bare None is returned instead of a tuple — callers must
        handle both shapes.

        The CSBS specifies figures for breaches both 'with' and 'without
        outcomes'. We have ignored the latter here.
        """
        size = pairing[0]
        industry = pairing[1]

        size_mean = None
        size_median = None
        industry_mean = None
        industry_median = None

        log.info(
            "Attempting to get incident cost averages specific to ('%s', '%s')...",
            size,
            industry,
        )

        size_node = GraphInterface._get_node("Size", name=size)
        if size_node:
            log.info("Found node for size '%s'.", size)
        else:
            log.info("No node found for size '%s'.", size)

        industry_node = GraphInterface._get_node("Industry", name=industry)
        if industry_node:
            log.info("Found node for industry '%s'.", industry)
        else:
            log.info("No node found for industry '%s'.", industry)

        # If no figures were found for this pairing, returns None.
        if size_node is None and industry_node is None:
            return None

        if size_node:
            size_relations = GraphInterface.g.match({size_node}, r_type=FOR_SIZE)

            # Collects mean/median figures from every IncidentCostAverages
            # node attached to this size.
            size_means = []
            size_medians = []
            for rel in size_relations:
                if rel.start_node.has_label("IncidentCostAverages"):
                    size_means.append(rel.start_node["mean"])
                    size_medians.append(rel.start_node["median"])

            # Converts however many mean and median values returned into one of
            # each.
            if len(size_means) > 1:
                log.info("Multiple mean values found for size '%s', averaging...", size)
                size_mean = sum(size_means) / len(size_means)
            elif len(size_means) == 1:
                log.info("Mean value found for size '%s'.", size)
                size_mean = size_means[0]
            else:
                log.info("No mean values found for size '%s'.", size)
            if len(size_medians) > 1:
                log.info(
                    "Multiple median values found for size '%s', averaging...", size
                )
                size_median = sum(size_medians) / len(size_medians)
            elif len(size_medians) == 1:
                log.info("Median value found for size '%s'.", size)
                size_median = size_medians[0]
            else:
                log.info("No median values found for size '%s'.", size)

        if industry_node:
            industry_relations = GraphInterface.g.match(
                {industry_node}, r_type=FOR_INDUSTRY
            )

            # Same collection as above, for the industry node.
            industry_means = []
            industry_medians = []
            for rel in industry_relations:
                if rel.start_node.has_label("IncidentCostAverages"):
                    industry_means.append(rel.start_node["mean"])
                    industry_medians.append(rel.start_node["median"])

            # Converts however many mean and median values returned into one of
            # each.
            if len(industry_means) > 1:
                log.info(
                    "Multiple mean values found for industry '%s', averaging...",
                    industry,
                )
                industry_mean = sum(industry_means) / len(industry_means)
            elif len(industry_means) == 1:
                log.info("Mean value found for industry '%s'.", industry)
                industry_mean = industry_means[0]
            else:
                log.info("No mean values found for industry '%s'.", industry)
            if len(industry_medians) > 1:
                log.info(
                    "Multiple median values found for industry '%s', averaging...",
                    industry,
                )
                industry_median = sum(industry_medians) / len(industry_medians)
            elif len(industry_medians) == 1:
                log.info("Median value found for industry '%s'.", industry)
                industry_median = industry_medians[0]
            else:
                log.info("No median values found for industry '%s'.", industry)

        # NOTE(review): a stored average of exactly 0 is falsy and is treated
        # as "not found" by the checks below — confirm 0 values cannot occur.
        if size_mean and industry_mean:
            log.info(
                "Mean values found for both size '%s' and industry '%s', averaging...",
                size,
                industry,
            )
            mean = (size_mean + industry_mean) / 2
        else:
            mean = size_mean or industry_mean

        if size_median and industry_median:
            log.info(
                "Median values found for both size '%s' and industry '%s', averaging...",
                size,
                industry,
            )
            median = (size_median + industry_median) / 2
        else:
            median = size_median or industry_median

        return mean, median

    # pylint: enable=too-many-branches,too-many-locals,too-many-statements
|
||||
|
||||
    # pylint: disable=invalid-name,anomalous-backslash-in-string
    @staticmethod
    def get_incident_frequency_distribution(
        pairing: Tuple = ("All", "All")
    ) -> Union[Dict[str, float], None]:
        """
        Returns the most relevant available incident frequency distribution for
        a given pairing, as a dict with keys ``"a"`` and ``"b"``.

        The algorithm for determining this is currently very basic:

        1. search for an exact match for the pairing, and return that if found; else
        2. return the distribution for :math:`\left(\text{All}, \text{All}\right)`.

        In future, this can and should be expanded to follow complex heuristics
        for similarity (and some relationships for doing so are provided at the
        end of this module). For example, two industries can be joined using the
        SIMILAR_TO relationship, which would allow the algorithm to traverse
        laterally to other leaf nodes.

        An even simpler improvement would be to add handling for partial matches
        (e.g., returning :math:`\left(\text{Micro}, \text{All}\right)`, which
        should be more relevant to a :math:`\left(\text{Micro}, \text{IT}\right)`
        organisation than the fallback :math:`\left(\text{All}, \text{All}\right)`
        values will be.
        """
        # pylint: enable=anomalous-backslash-in-string

        size = pairing[0]
        industry = pairing[1]

        size_node = GraphInterface._get_node("Size", name=size)
        if size_node:
            log.info("Found node for size '%s'.", size)
        else:
            log.info("No node found for size '%s'.", size)

        industry_node = GraphInterface._get_node("Industry", name=industry)
        if industry_node:
            log.info("Found node for industry '%s'.", industry)
        else:
            log.info("No node found for industry '%s'.", industry)

        # If no figures were found for this pairing, returns the fallback values.
        if size_node is None and industry_node is None:
            return GraphInterface._get_frequency_distribution()

        dist: Union[
            Dict[str, float], None
        ] = GraphInterface._get_frequency_distribution(size, industry)

        # NOTE(review): `_get_frequency_distribution` may signal "no match"
        # with a `(None, None)` tuple rather than None, in which case the
        # subscripts below would raise a TypeError — confirm/fix there.
        if dist is not None:
            log.debug(
                "Returned values are: a = %s, b = %s", str(dist["a"]), str(dist["b"])
            )

        return dist

    # pylint: enable=invalid-name
|
||||
|
||||
    # pylint: disable=anomalous-backslash-in-string
    @staticmethod
    def get_incident_costs_distribution(
        pairing: Tuple = ("All", "All")
    ) -> Union[Dict[str, float], None]:
        """
        Returns the most relevant available incident costs distribution for
        a given pairing, as a dict with keys ``"mean"`` and ``"stddev"``.

        The algorithm for determining this is currently very basic:

        1. search for an exact match for the pairing, and return that if found; else
        2. return the distribution for :math:`\left(\text{All}, \text{All}\right)`.

        In future, this can and should be expanded to follow complex heuristics
        for similarity (and some relationships for doing so are provided at the
        end of this module). For example, two industries can be joined using the
        SIMILAR_TO relationship, which would allow the algorithm to traverse
        laterally to other leaf nodes.

        An even simpler improvement would be to add handling for partial matches
        (e.g., returning :math:`\left(\text{Micro}, \text{All}\right)`, which
        should be more relevant to a :math:`\left(\text{Micro}, \text{IT}\right)`
        organisation than the fallback :math:`\left(\text{All}, \text{All}\right)`
        values will be.
        """
        # pylint: enable=anomalous-backslash-in-string

        size = pairing[0]
        industry = pairing[1]

        size_node = GraphInterface._get_node("Size", name=size)
        if size_node:
            log.info("Found node for size '%s'.", size)
        else:
            log.info("No node found for size '%s'.", size)

        industry_node = GraphInterface._get_node("Industry", name=industry)
        if industry_node:
            log.info("Found node for industry '%s'.", industry)
        else:
            log.info("No node found for industry '%s'.", industry)

        # If no figures were found for this pairing, returns the fallback values.
        if size_node is None and industry_node is None:
            return GraphInterface._get_costs_distribution()

        dist: Union[Dict[str, float], None] = GraphInterface._get_costs_distribution(
            size, industry
        )

        # NOTE(review): `_get_costs_distribution` may signal "no match" with a
        # `(None, None)` tuple rather than None, in which case the subscripts
        # below would raise a TypeError — confirm/fix there.
        if dist is not None:
            log.debug(
                "Returned values are: mean = %s, stddev = %s",
                str(dist["mean"]),
                str(dist["stddev"]),
            )

        return dist
|
||||
|
||||
@staticmethod
|
||||
def get_sizes() -> List[str]:
|
||||
"""Returns a list of all of the organisation size values."""
|
||||
nodes = GraphInterface._get_nodes("Size")
|
||||
|
||||
return [node["name"] for node in nodes]
|
||||
|
||||
@staticmethod
|
||||
def get_industries() -> List[str]:
|
||||
"""Returns a list of all of the organisation industry values."""
|
||||
nodes = GraphInterface._get_nodes("Industry")
|
||||
|
||||
return [node["name"] for node in nodes]
|
||||
|
||||
@staticmethod
|
||||
def get_sizes_and_industries() -> Tuple[list, list]:
|
||||
"""Returns all available organisation size and industry values."""
|
||||
return GraphInterface.get_sizes(), GraphInterface.get_industries()
|
||||
|
||||
# pylint: disable=invalid-name
|
||||
@staticmethod
|
||||
def create_incident_frequency_distribution_node(
|
||||
pairing: Tuple, a: float, b: float
|
||||
) -> Node:
|
||||
"""Adds an `IncidentFrequencyDistribution` node to the Neo4j graph database."""
|
||||
size_node = GraphInterface._get_node("Size", name=pairing[0])
|
||||
industry_node = GraphInterface._get_node("Industry", name=pairing[1])
|
||||
|
||||
node = GraphInterface._create_node(
|
||||
"IncidentFrequencyDistribution", a=a, b=b, calculated_at=datetime.now()
|
||||
)
|
||||
GraphInterface._create_relationship(node, FOR_SIZE, size_node)
|
||||
GraphInterface._create_relationship(node, FOR_INDUSTRY, industry_node)
|
||||
return node
|
||||
|
||||
# pylint: enable=invalid-name
|
||||
|
||||
@staticmethod
|
||||
def create_incident_costs_distribution_node(
|
||||
pairing: Tuple, mean: float, stddev: float
|
||||
) -> Node:
|
||||
"""Adds an `IncidentCostsDistribution` node to the Neo4j graph database."""
|
||||
size_node = GraphInterface._get_node("Size", name=pairing[0])
|
||||
industry_node = GraphInterface._get_node("Industry", name=pairing[1])
|
||||
|
||||
node = GraphInterface._create_node(
|
||||
"IncidentCostsDistribution",
|
||||
mean=mean,
|
||||
stddev=stddev,
|
||||
calculated_at=datetime.now(),
|
||||
)
|
||||
GraphInterface._create_relationship(node, FOR_SIZE, size_node)
|
||||
GraphInterface._create_relationship(node, FOR_INDUSTRY, industry_node)
|
||||
return node
|
||||
|
||||
# pylint: disable=anomalous-backslash-in-string,invalid-name
|
||||
@staticmethod
|
||||
def _get_frequency_distribution(
|
||||
size: str = "All", industry: str = "All"
|
||||
) -> Dict[float, float]:
|
||||
"""
|
||||
Returns the :math:`a` and :math:`b` values from the requested incident
|
||||
frequency distribution node (if it exists). Call with no arguments to
|
||||
use the fallback (:math:`\left(\text{All}, \text{All}\right)`) node.
|
||||
"""
|
||||
# pylint: enable=anomalous-backslash-in-string
|
||||
|
||||
# pylint: disable=line-too-long
|
||||
result = GraphInterface.g.run(
|
||||
"MATCH (:Size {{name:'{}'}})<-[:FOR_SIZE]-(node:IncidentFrequencyDistribution)-[:FOR_INDUSTRY]->(:Industry {{name:'{}'}}) "
|
||||
"RETURN node;".format(size, industry)
|
||||
)
|
||||
# pylint: enable=line-too-long
|
||||
|
||||
nodes = [record["node"] for record in result]
|
||||
|
||||
if len(nodes) == 0:
|
||||
# There should always be a (All, All) distribution at least.
|
||||
if size == "All" and industry == "All":
|
||||
raise Exception("No fallback node found!")
|
||||
|
||||
log.debug(
|
||||
"No incident frequency distribution found for (%s, %s).",
|
||||
str(size),
|
||||
str(industry),
|
||||
)
|
||||
return None, None
|
||||
log.debug("Results: %s", str(nodes))
|
||||
|
||||
a = [node["a"] for node in nodes]
|
||||
b = [node["b"] for node in nodes]
|
||||
|
||||
if len(nodes) > 0:
|
||||
log.info("Multiple fallback nodes found, averaging parameters...")
|
||||
a = sum(a) / len(a)
|
||||
b = sum(b) / len(b)
|
||||
else:
|
||||
a = a[0]
|
||||
b = b[0]
|
||||
|
||||
return {"a": a, "b": b}
|
||||
|
||||
# pylint: enable=invalid-name
|
||||
|
||||
# pylint: disable=anomalous-backslash-in-string
|
||||
@staticmethod
|
||||
def _get_costs_distribution(
|
||||
size: str = "All", industry: str = "All"
|
||||
) -> Dict[float, float]:
|
||||
"""
|
||||
Returns the :math:`a` and :math:`b` values from the requested incident
|
||||
frequency distribution node (if it exists). Call with no arguments to
|
||||
use the fallback (:math:`\left(\text{All}, \text{All}\right)`) node.
|
||||
"""
|
||||
# pylint: enable=anomalous-backslash-in-string
|
||||
|
||||
# pylint: disable=line-too-long
|
||||
result = GraphInterface.g.run(
|
||||
"MATCH (:Size {{name:'{}'}})<-[:FOR_SIZE]-(node:IncidentCostsDistribution)-[:FOR_INDUSTRY]->(:Industry {{name:'{}'}}) "
|
||||
"RETURN node;".format(size, industry)
|
||||
)
|
||||
# pylint: enable=line-too-long
|
||||
|
||||
nodes = [record["node"] for record in result]
|
||||
|
||||
if len(nodes) == 0:
|
||||
# There should always be a (All, All) distribution at least.
|
||||
if size == "All" and industry == "All":
|
||||
raise Exception("No fallback node found!")
|
||||
|
||||
log.debug(
|
||||
"No incident frequency distribution found for (%s, %s).",
|
||||
str(size),
|
||||
str(industry),
|
||||
)
|
||||
return None, None
|
||||
log.debug("Results: %s", str(nodes))
|
||||
|
||||
mean = [node["mean"] for node in nodes]
|
||||
stddev = [node["stddev"] for node in nodes]
|
||||
|
||||
if len(nodes) > 1:
|
||||
log.info("Multiple fallback nodes found, averaging parameters...")
|
||||
mean = sum(mean) / len(mean)
|
||||
stddev = sum(stddev) / len(stddev)
|
||||
else:
|
||||
mean = mean[0]
|
||||
stddev = stddev[0]
|
||||
|
||||
return {"mean": mean, "stddev": stddev}
|
||||
|
||||
# pylint: disable=invalid-name
|
||||
@staticmethod
|
||||
def _create_node(*labels, **properties) -> Node:
|
||||
"""Creates a new node in the Neo4j graph database."""
|
||||
tx = GraphInterface.g.begin()
|
||||
node = Node(*labels, **properties)
|
||||
tx.create(node)
|
||||
tx.commit()
|
||||
return node
|
||||
|
||||
# pylint: enable=invalid-name
|
||||
|
||||
# pylint: disable=invalid-name
|
||||
@staticmethod
|
||||
def _create_relationship(
|
||||
start_node, relationship, end_node, **properties
|
||||
) -> Relationship:
|
||||
"""Creates a new relationship in the Neo4j graph database."""
|
||||
tx = GraphInterface.g.begin()
|
||||
relationship = Relationship(
|
||||
start_node, relationship.__name__, end_node, **properties
|
||||
)
|
||||
tx.create(relationship)
|
||||
tx.commit()
|
||||
return relationship
|
||||
|
||||
# pylint: enable=invalid-name
|
||||
|
||||
@staticmethod
|
||||
def _get_node(*labels, **properties) -> Union[Node, None]:
|
||||
"""Returns a node from the Neo4j graph database."""
|
||||
return GraphInterface.g.nodes.match(*labels, **properties).first()
|
||||
|
||||
    @staticmethod
    def _get_nodes(*labels, **properties) -> NodeMatcher:
        """Returns a matcher over *all* nodes with the given labels and
        properties (not a single node) from the Neo4j graph database."""
        return GraphInterface.g.nodes.match(*labels, **properties)
|
||||
|
||||
@staticmethod
|
||||
def _dict_to_jsobj(properties) -> str:
|
||||
"""Recursively converts a Python `dict` into a JS `Object`."""
|
||||
if isinstance(properties, dict):
|
||||
return re.sub("'([a-z_]*)':", "\\1:", str(properties))
|
||||
|
||||
if isinstance(properties, str):
|
||||
return GraphInterface._dict_to_jsobj({"name": properties})
|
||||
|
||||
return "{}"
|
||||
|
||||
|
||||
# Relationship types used in the graph. py2neo derives a relationship's type
# from the subclass name, so these deliberately use UPPER_SNAKE_CASE (hence
# the pylint disables).
# pylint: disable=invalid-name,missing-class-docstring
class SUBSECTION_OF(Relationship):
    # Presumably links an industry sub-section to its section — not used in
    # this module; TODO confirm against the data-loading code.
    pass


class SECTION_OF(Relationship):
    # Presumably links an industry section to a parent grouping — not used in
    # this module; TODO confirm.
    pass


class SIMILAR_TO(Relationship):
    # Joins two similar industries, enabling lateral traversal between leaf
    # nodes (see `get_incident_frequency_distribution`'s docstring).
    pass


class FOR_SIZE(Relationship):
    # Links a distribution/probability node to the `Size` node it applies to.
    pass


class FOR_INDUSTRY(Relationship):
    # Links a distribution/probability node to the `Industry` node it applies
    # to.
    pass


# pylint: enable=invalid-name,missing-class-docstring
|
420
src/montecarlo.py
Normal file
420
src/montecarlo.py
Normal file
|
@ -0,0 +1,420 @@
|
|||
"""
|
||||
Monte Carlo Simulation Script
|
||||
|
||||
This script runs a Monte Carlo simulation for an organisation of a given
|
||||
size and industry, utilising the most relevant available available.
|
||||
|
||||
Acknowledgements: Dr Dan Prince & Dr Chris Sherlock
|
||||
"""
|
||||
|
||||
import os
|
||||
import sys
|
||||
import argparse
|
||||
import pickle
|
||||
import logging as log
|
||||
|
||||
from typing import Tuple, Dict, Union
|
||||
|
||||
import random
|
||||
import math
|
||||
import numpy as np
|
||||
import pandas as pd
|
||||
from matplotlib import pyplot as plt
|
||||
|
||||
from graph import GraphInterface as gi
|
||||
|
||||
# Used for logging, equivalent to `logging.INFO`.
|
||||
SUCCESS = 20
|
||||
|
||||
# If not specified, the default number of Monte Carlo simulation runs to perform.
|
||||
DEFAULT_RUNS = 5000
|
||||
|
||||
# The arbitrary maximum number of incidents that an organisation can experience
|
||||
# in a year.
|
||||
MAX_ANNUAL_INCIDENTS = 8000
|
||||
|
||||
# The maximum value of a company; any yearly losses over result in a bankruptcy
|
||||
COMPANY_VALUE = 100000
|
||||
|
||||
# A smaller value increases the curviness of the loss exeedence curve.
|
||||
# Less than 30 starts to get a bit steppy though.
|
||||
LEC_PRECISION = math.floor(COMPANY_VALUE / 30)
|
||||
|
||||
# Quantifies the quantitative boundaries for human-readable incident frequencies,
|
||||
# which many sources (e.g., the CSBS 2020) use to present their results.
|
||||
#
|
||||
# 'None' = 0
|
||||
# 'Annually' = 1
|
||||
# 'Less than monthly' = 2–7
|
||||
# 'Monthly' = 8–17
|
||||
# 'Weekly' = 18–79
|
||||
# 'Daily' = 80–399
|
||||
# 'More than daily' = 400–8000
|
||||
BOUNDARIES = {
|
||||
"None": 0,
|
||||
"Once per year": 1,
|
||||
"Less than once a month": 2,
|
||||
"Once a month": 8,
|
||||
"Once a week": 18,
|
||||
"Once a day": 80,
|
||||
"Several times a day": 400,
|
||||
"MAX": MAX_ANNUAL_INCIDENTS,
|
||||
}
|
||||
|
||||
N = None
|
||||
OUTPUT_DIR = None
|
||||
IMAGES = None
|
||||
FORCE = None
|
||||
|
||||
|
||||
def _calculate_num_of_incidents(incidents_dist: Dict[float, float]) -> float:
    """Draws a single simulated year's incident count.

    Samples by inverse transform using the distribution's ``a`` and ``b``
    parameters, truncating the result at ``MAX_ANNUAL_INCIDENTS``.
    """
    log.debug("Incident distribution: %s", str(incidents_dist))

    uniform_draw = np.random.uniform()
    sampled = incidents_dist["b"] / (1 - uniform_draw) ** (1 / incidents_dist["a"])
    log.debug("Number of incidents (as `int`): %s", str(int(sampled)))

    # Cap the draw at the arbitrary annual maximum.
    return min(int(sampled), MAX_ANNUAL_INCIDENTS)
|
||||
|
||||
|
||||
def _calculate_sum_cost_of_incidents(
    num_of_incidents: int, costs_dist: Dict[float, float], idx: int = None
) -> float:
    """Samples a log-normal cost for each of `num_of_incidents` incidents and
    returns their total."""
    log.debug("Costs distribution: %s", str(costs_dist))

    # Progress logging: every iteration for small runs, roughly every 1% of
    # iterations otherwise.
    if (N < 1000) or (N >= 1000 and idx % math.floor(N / 100) == 0):
        log.info(
            "Running Monte Carlo simulation... (%s/%s iterations)", str(idx), str(N)
        )

    if num_of_incidents == 0:
        return 0

    mean = costs_dist["mean"]
    stddev = costs_dist["stddev"]

    # Convert the arithmetic mean/stddev into the underlying normal's
    # parameters for log-normal sampling.
    loc = np.log(mean ** 2 / np.sqrt(stddev ** 2 + mean ** 2))
    shape = np.sqrt(np.log(1 + (stddev ** 2 / mean ** 2)))

    return sum(
        random.lognormvariate(loc, shape) for _ in range(num_of_incidents)
    )
|
||||
|
||||
|
||||
# pylint: disable=invalid-name
def _get_most_relevant_incident_frequency_distribution(
    pairing: Tuple = ("All", "All")
) -> Union[Dict[float, float], None]:
    """Looks up the incident frequency distribution most relevant to
    `pairing` in the Neo4j graph database."""
    log.info(
        "Finding most relevant incident frequency distribution for %s...", str(pairing)
    )
    dist = gi.get_incident_frequency_distribution(pairing)
    return dist


# pylint: enable=invalid-name
|
||||
|
||||
|
||||
def _get_most_relevant_incident_costs_distribution(
    pairing: Tuple = ("All", "All")
) -> Union[Dict[float, float], None]:
    """Looks up the incident costs distribution most relevant to `pairing`
    in the Neo4j graph database."""
    log.info(
        "Finding most relevant incident costs distribution for %s...", str(pairing)
    )
    costs = gi.get_incident_costs_distribution(pairing)
    return costs
|
||||
|
||||
|
||||
def _get_most_relevant_distributions(
    pairing: Tuple = ("All", "All")
) -> Tuple[Union[Dict[float, float], None], Union[Dict[float, float], None]]:
    """Generate (or retrieve) a population of annual incident quantities and a
    distribution of incident-with-outcome cost values.

    Results are cached as pickles in OUTPUT_DIR (unless FORCE is set), keyed
    by the pairing.

    :param pairing: a ``(size, industry)`` tuple.
    :returns: an ``(incidents_dist, costs_dist)`` tuple; either element may
        be None when no distribution is available.
    """
    cache_name = "{}-{}.pickle".format(pairing[0], pairing[1])

    # -- caching --
    # Retrieves previously-calculated values if possible.
    if not FORCE and OUTPUT_DIR is not None:
        try:
            # BUG FIX: open the cache inside a `with` block so the file
            # handle is always closed (the original leaked it).
            # NOTE: pickle is acceptable here only because the cache files
            # are produced locally by this script.
            with open(OUTPUT_DIR + cache_name, "rb") as cache_file:
                dists = pickle.load(cache_file)

            log.info("Previously-calculated distributions found")
            return dists["incidents"], dists["costs"]
        except (OSError, IOError):
            log.info("Previously-calculated distributions not found")

    # Otherwise, generates fresh ones (opening the DB connection first).
    gi.__init__()

    incidents_dist = _get_most_relevant_incident_frequency_distribution(pairing)
    costs_dist = _get_most_relevant_incident_costs_distribution(pairing)

    log.debug(
        "Returned values are: incidents_dist = %s, costs_dist = %s",
        str(incidents_dist),
        str(costs_dist),
    )

    # Saves the figures for faster analysis in future.
    if OUTPUT_DIR is not None and incidents_dist is not None and costs_dist is not None:
        dists = {
            "incidents": incidents_dist,
            "costs": costs_dist,
        }
        # BUG FIX: `with` block, as above.
        with open(OUTPUT_DIR + cache_name, "wb") as cache_file:
            pickle.dump(dists, cache_file)

    return incidents_dist, costs_dist
|
||||
|
||||
|
||||
# pylint: disable=anomalous-backslash-in-string
def _run_monte_carlo_simulation(pairing: Tuple = ("All", "All")) -> Tuple:
    """
    Runs :math:`n` simulations of a 12-month period, calculating the number
    of incidents encountered each time and their cumulative costs.

    Returns a ``(nums_of_incidents, sum_costs)`` tuple, or ``(None, None)``
    when no distributions could be found for *pairing*.
    """
    # pylint: enable=anomalous-backslash-in-string

    # Generates both distributions (cached copies are used where available)
    incidents_dist, costs_dist = _get_most_relevant_distributions(pairing)

    # NOTE(review): this uses `and`, so if exactly one distribution is missing
    # the code continues and the later calculations may fail — confirm whether
    # `or` was intended.
    if incidents_dist is None and costs_dist is None:
        return incidents_dist, costs_dist

    # Calculates the number of incidents suffered over $n$ simulated years
    nums_of_incidents = np.array(
        [_calculate_num_of_incidents(incidents_dist) for i in range(N)]
    )
    log.debug("Number of incidents: %s", str(nums_of_incidents))

    # Histogram of incident counts (log10-scaled; zero counts plotted as 0)
    _label_plot(
        "Histogram of Incident Frequencies (over 12 months)",
        "Number of Incidents ($log_{10}$)",
        "Frequency",
    )
    plt.hist(
        [np.log10(i) if i > 0 else 0 for i in nums_of_incidents],
        align="left",
        bins=range(12),
    )
    _save_plot("2 - histogram of incident frequencies")

    # Calculates the annual costs for each simulated year
    log.info("Running Monte Carlo simulation... (0/%s iterations)", str(N))
    sum_costs = [
        _calculate_sum_cost_of_incidents(num_of_incidents, costs_dist, idx)
        for idx, num_of_incidents in enumerate(nums_of_incidents, start=1)
    ]
    log.info("Running Monte Carlo simulation... (%s/%s iterations)", str(N), str(N))

    # Histogram of the total annual costs, capped at the company's value
    _label_plot(
        "Histogram of Sum Costs (over 12 months)", "Total Cost (£)", "Frequency"
    )
    plt.ticklabel_format(style="plain")
    plt.hist(sum_costs, align="left", bins=15, range=(0, COMPANY_VALUE))
    _save_plot("4 - histogram of sum costs")

    # Kernel-density view of the same annual cost figures
    _label_plot("Density of Sum Costs (over 12 months)", "Total Cost (£)", "Density")
    pd.Series(sum_costs).plot(kind="density")
    plt.xlim(0, COMPANY_VALUE * 2)
    plt.ticklabel_format(style="plain")
    _save_plot("5 - density of sum costs")

    # Get loss exceedance curve
    log.info("Generating loss exceedance curve")

    # Reverse cumulative sum converts the histogram into "% of years with a
    # loss of at least this much".
    hist, edges = np.histogram(sum_costs, bins=LEC_PRECISION)
    cumrev = np.cumsum(hist[::-1])[::-1] * 100 / len(sum_costs)

    _label_plot(
        "Loss Exceedance Curve (Monte Carlo sim)",
        "Loss (£, 99th percentile)",
        "Chance of Loss or Greater (%)",
    )
    plt.ticklabel_format(style="plain")
    plt.xlim(0, COMPANY_VALUE)
    plt.plot(edges[:-1], cumrev)
    # The LEC is always saved; the "6 - " prefix is only used when the
    # intermediate images are also being written.
    _save_plot("6 - lec" if IMAGES else "lec")

    log.info("Simulation complete!")

    return nums_of_incidents, sum_costs
|
||||
|
||||
|
||||
def main():
    """Called when the script is run from the command-line.

    Parses the CLI arguments, configures logging, runs the Monte Carlo
    simulation for the requested (size, industry) pairing and then for the
    general ("All", "All") pairing, and prints the averaged results to stdout
    (those lines are consumed by the game server).
    """
    # The simulation parameters live in module-level globals so the helper
    # functions can read them without threading arguments through every call.
    # pylint: disable=global-statement
    global N, OUTPUT_DIR, IMAGES, FORCE
    # pylint: enable=global-statement

    parser = argparse.ArgumentParser()

    parser.add_argument(
        "-n",
        "--number",
        help="The number of simulations to run (default: " + str(DEFAULT_RUNS) + ")",
        type=int,
        default=DEFAULT_RUNS,
    )
    parser.add_argument(
        "-s",
        "--size",
        help="The size of the organisation to simulate (default: all)",
        type=str,
        default="All",
    )
    parser.add_argument(
        "-i",
        "--industry",
        help="The industry of the organisation to simulate (default: all)",
        type=str,
        default="All",
    )
    parser.add_argument(
        "-o",
        "--output",
        help="Specify the output directory (default: ./output/)",
        type=str,
        default=os.path.join(os.path.dirname(__file__), "output/"),
        metavar="DIRECTORY",
    )
    parser.add_argument(
        "-p",
        "--images",
        help="Output images at each step of the script (default: false, just \
            output the final LEC image)",
        action="store_true",
        default=False,
    )
    parser.add_argument(
        "-f",
        "--force",
        help="Force re-generation of incident and cost distributions (default: false)",
        action="store_true",
        default=False,
    )
    parser.add_argument(
        "-v",
        "--verbose",
        help="Verbose console output (default: false)",
        action="store_true",
        default=False,
    )
    parser.add_argument(
        "-d",
        "--debug",
        help="Show debug console output (default: false)",
        action="store_true",
        default=False,
    )

    args = parser.parse_args()

    N = args.number
    OUTPUT_DIR = args.output
    IMAGES = args.images
    FORCE = args.force

    size = args.size
    industry = args.industry

    # Logging verbosity: --debug beats --verbose; the default shows only
    # warnings and above (plus SUCCESS-level messages).
    if args.debug:
        log.basicConfig(format="%(levelname)s: %(message)s", level=log.DEBUG)
        log.info("Debug output.")
    elif args.verbose:
        log.basicConfig(format="%(levelname)s: %(message)s", level=log.INFO)
        log.info("Verbose output.")
    else:
        log.basicConfig(format="%(levelname)s: %(message)s")

    # Ensures the output directory exists before any plots/caches are written.
    if not os.path.isdir(OUTPUT_DIR):
        os.makedirs(OUTPUT_DIR)

    # NOTE(review): `size` and `industry` default to "All" (truthy), so this
    # branch is always taken; the guard only matters if empty strings are
    # passed explicitly.
    if size or industry:
        print("Running simulation for ({}, {})".format(size, industry))
        nums_of_incidents, sum_costs = _run_monte_carlo_simulation((size, industry))
        if nums_of_incidents is not None and sum_costs is not None:
            log.info(
                "Results:\nNumbers of incidents: %s\nSum costs: %s\n",
                str(nums_of_incidents),
                str(sum_costs),
            )

            avg_num_of_incidents = int(sum(nums_of_incidents) / len(nums_of_incidents))
            avg_sum_costs = sum(sum_costs) / len(sum_costs)
            log.log(
                SUCCESS,
                "Results:\nAverage number of incidents: %d\nAverage cost: £%.2f",
                avg_num_of_incidents,
                avg_sum_costs,
            )

            # Print output that will be picked up by game server.
            # pylint: disable=fixme
            # TODO: For some reason the results at the moment are orders of magnitude
            # too high, so for now I've plugged it by dividing both results by 100.
            # pylint: enable=fixme
            print(int(avg_num_of_incidents / 100))
            print("%.2f" % (avg_sum_costs / 100))
        else:
            log.warning("No data found.")
            print("No data found.")

    # Always also runs the general (All, All) simulation for comparison.
    # NOTE(review): if no general data is available this returns (None, None)
    # and the `sum(...)` calls below would raise a TypeError — confirm the
    # general distributions always exist.
    print("Running simulation for (All, All)")
    gen_nums_of_incidents, gen_sum_costs = _run_monte_carlo_simulation()
    log.info(
        "Results:\nNumbers of incidents: %s\nSum costs: %s\n",
        str(gen_nums_of_incidents),
        str(gen_sum_costs),
    )

    avg_gen_num_of_incidents = int(
        sum(gen_nums_of_incidents) / len(gen_nums_of_incidents)
    )
    avg_gen_sum_costs = sum(gen_sum_costs) / len(gen_sum_costs)
    log.log(
        SUCCESS,
        "Results:\nAverage number of incidents: %d\nAverage cost: £%.2f",
        avg_gen_num_of_incidents,
        avg_gen_sum_costs,
    )

    # Print output that will be picked up by the game server.
    print(int(avg_gen_num_of_incidents / 100))
    print("%.2f" % (avg_gen_sum_costs / 100))

    sys.exit(0)
|
||||
|
||||
|
||||
def _label_plot(title="Untitled Plot", xlabel="x axis", ylabel="y axis") -> None:
    """Attach a title and axis labels to the current pyplot figure."""
    for setter, text in ((plt.title, title), (plt.xlabel, xlabel), (plt.ylabel, ylabel)):
        setter(text)
|
||||
|
||||
|
||||
def _save_plot(filename="untitled") -> None:
    """Write the current figure to OUTPUT_DIR as a PNG (only when image
    output is enabled), then clear it ready for the next plot."""
    if IMAGES:
        target = OUTPUT_DIR + filename + ".png"
        plt.savefig(target)
    plt.clf()
|
||||
|
||||
|
||||
# Script entry point: run the simulation only when executed directly,
# not when this module is imported.
if __name__ == "__main__":
    main()
|
103
src/montecarlo.r
Normal file
103
src/montecarlo.r
Normal file
|
@ -0,0 +1,103 @@
|
|||
#
# Secure Digitalisation Monte Carlo Simulation Script
#
# This script runs a Monte Carlo simulation using breach likelihood and cost
# figures derived from the Cyber Security Breaches Survey 2020 (CSBS).
# This script is an unfinished prototype, and has since been superseded by
# `montecarlo.py`.
#
# Acknowledgements: Dr Dan Prince & Dr Chris Sherlock
#

# Probability mass of each human-readable breach-frequency band (from the
# CSBS) and the annual-incident-count upper boundary of each band.
masses = c(0.54, 0.1058, 0.1012, 0.0966, 0.069, 0.0368, 0.0414)
boundaries = c(1, 2, 8, 18, 80, 400, 8000)

# Empirical complementary CDF on log-log axes; an approximately straight
# line suggests a power-law (Pareto) model fits the banded data.
Fs = cumsum(masses)
plot(log(boundaries), log(1 - Fs))

# Least-squares fit of log(1 - F(x)) = a*log(b) - a*log(x) to recover the
# Pareto shape (a) and scale (b) parameters.
xs = log(boundaries)
ys = log(1 - Fs)
fit = lm(ys ~ xs)
summary(fit)

alogb = fit$coeff[1]
a = -fit$coeff[2]
b = exp(alogb/a)
print(a)
print(b)

# Number of simulated years.
n = 10000

# Inverse-transform sampling from the fitted Pareto distribution to draw n
# annual attack counts.
us = runif(n)
xs = b / (1 - us)^(1 / a)
# (A stray argument-less `print()` call was removed here: `print()` with no
# arguments is an error in R and halted the script.)

# Sanity check: the per-band probabilities recovered from the sample (ps)
# should approximate the input masses.
p0 = mean(xs < boundaries[1])
p1 = mean(xs < boundaries[2]) - p0
p2 = mean(xs < boundaries[3]) - p0 - p1
p3 = mean(xs < boundaries[4]) - p0 - p1 - p2
p4 = mean(xs < boundaries[5]) - p0 - p1 - p2 - p3
p5 = mean(xs < boundaries[6]) - p0 - p1 - p2 - p3 - p4
ps = c(p0, p1, p2, p3, p4, p5, 1 - (p0 + p1 + p2 + p3 + p4 + p5))

print(ps)
print(masses)

nattacks = floor(xs)
hist(log10(nattacks),
     main = "Histogram of Number of Attacks/Breaches Over 12 Months",
     xlab = expression("Number of Attacks (log"[10]*")"),
     ylab = "Frequency",
     breaks = 0:12)

# Plots the distribution for the average cost of breach(es) over 12 months

# Mean and median annual breach costs (CSBS figures).
mean = 3230
median = 274

# For a log-normal: median = e^mu and mean = e^(mu + sigma^2 / 2), hence
# sigma = sqrt(2 * (log(mean) - log(median))).
logstd = sqrt(2 * (log(mean) - if (median == 0) 0 else log(median)))
std = exp(logstd)

curve(dlnorm(x, log(mean), log(std)), from=1, to=5000,
      main = "Average annual breach cost distribution",
      xlab = 'Cost (£)',
      ylab = 'Density',
      lwd = 2)

# Runs the MonteCarlo simulation

# Total cost of n breaches, each drawn from the log-normal cost distribution.
# Relies on the globals `loc` and `shape`, which are assigned below before
# the first call is made.
simulateCosts <- function(n) {
  return(if (n >= 1) sum(rlnorm(n, loc, shape)) else 0)
}

n = 10000

# Log-normal parameters expressed via the arithmetic mean / std deviation.
loc <- log(mean^2 / sqrt(std^2 + mean^2))
shape <- sqrt(log(1 + (std^2 / mean^2)))

# NOTE(review): this samples log10 of the attack counts, not the counts
# themselves, so simulateCosts receives fractional values — presumably
# `sample(nattacks, n)` was intended; verify before reusing this prototype.
numAttacks <- sample(log10(nattacks), n)
results <- sapply(numAttacks, simulateCosts)

hist(results,
     main = "Histogram of Total Costs Over 12 Months (Monte Carlo sim)",
     xlab = "Total cost (£)")

d <- density(results)
plot(d,
     main="Density of Total Costs Over 12 Months (Monte Carlo sim)",
     xlab=expression("Total Cost (£)"),
     ylab="Density")

# Get loss exceedance
# TODO: needs to be prettier, but `evaluate::loss_exceedance_curve()` is broken

maxValue = 2500
numOver <- length(results[results > maxValue])
risk = numOver/n

plot(d,
     main="Loss Exceedance (Monte Carlo sim)",
     xlab=expression("Total Cost (£)"),
     ylab="Density")

abline(v = maxValue, col="red", lwd=3, lty=2)
text(3000, 4e-04, labels=paste(floor(risk*100), "% chance of ≥£", maxValue, " losses"), adj=c(0, 0.5))
|
78
src/neo4r_connection_test.r
Normal file
78
src/neo4r_connection_test.r
Normal file
|
@ -0,0 +1,78 @@
|
|||
#
# Secure Digitalisation Neo4j Connection Script
#
# This script is intended to establish a connection to a Neo4j graph database
# and submit commands.
# This script is an unfinished prototype, and has since been superseded by
# `graph.py`.
#

# Installs and loads the required packages on every run (prototype-only;
# a real script would not call install.packages unconditionally).
install.packages('tidyverse')
library(tidyverse)
install.packages('purrr')
library(purrr)
install.packages('devtools')
library(devtools)
install_github("davidlrosenblum/neo4r@4.x")
library(neo4r)

# Number of simulations, and time steps per simulation.
RUNS <- 1000
DECISION.STEPS <- 12

# Fetches the breach probability of the generic incident node (one with no
# size/industry/area relationships), scaled from a percentage to [0, 1].
# Uses the global connection `con` defined further down.
get_likelihood <- function() {
  res <- 'MATCH (i:Incident) WHERE EXISTS (i.probability) AND NOT (i)-[:FOR_SIZE]-() AND NOT (i)-[:FOR_INDUSTRY]-() AND NOT (i)-[:FOR_AREA]-() RETURN i.probability AS probability;' %>%
    call_neo4j(con, type = 'row')

  res$probability / 100
}

# Currently only does direct costs
get_costs <- function() {
  res <- 'MATCH (i:Incident) WHERE EXISTS (i.direct_costs) AND NOT (i)-[:FOR_SIZE]-() AND NOT (i)-[:FOR_INDUSTRY]-() AND NOT (i)-[:FOR_AREA]-() RETURN i.direct_costs[0] AS cost;' %>%
    call_neo4j(con, type = 'row')

  res$cost
}

# Simulates one time step: decides whether a breach occurs and, if so,
# samples a cost for it.
# NOTE(review): several unresolved issues in this prototype —
#   * `alpha` is never used;
#   * `happen >= l` incurs a cost with probability 1 - l, which looks
#     inverted relative to the likelihood's meaning;
#   * `rlnorm(1, )` falls back to the default meanlog/sdlog rather than the
#     commented-out location/shape parameters, and `cost`/`s`/`m` are
#     computed but unused.
# Confirm all of the above before reusing.
calculate_cost <- function(alpha) {
  l <- get_likelihood()
  happen <- runif(1, 0, 1)
  if (happen >= l) {
    cost <- as.numeric(get_costs())
    s <- log(sd(580:630))
    m <- log(get_costs())
    #location <- log(m^2 / sqrt(s^2 + m^2))
    #shape <- sqrt(log(1 + (s^2 / m^2)))
    rlnorm(1, )
  } else {
    0
  }
}

# Connection handle used by the query helpers above.
con <- neo4j_api$new(
  url="http://localhost:7474",
  db="neo4j",
  user="neo4j",
  password="password"
)

# Runs RUNS simulations of DECISION.STEPS steps each, accumulating the
# per-step values into a long-format tibble for plotting.
simulations <- rerun(RUNS, replicate(DECISION.STEPS, runif(1) %>% calculate_cost())) %>%
  set_names(paste0("sim", 1:RUNS)) %>%
  map(~ accumulate(., ~ .x * .y)) %>%
  map_dfr(~ tibble(value = .x, step = 1:DECISION.STEPS), .id = "simulation")

# Plots every individual simulation trace.
simulations %>%
  ggplot(aes(x = step, y = value)) +
  geom_line(aes(color = simulation)) +
  theme(legend.position = "none") +
  ggtitle("Simulations of costs from breaches")

# Per-step mean/max/min across all simulations, reshaped for plotting.
summary_values <- simulations %>%
  group_by(step) %>%
  summarise(mean_return = mean(value), max_return = max(value), min_return = min(value)) %>%
  gather("series", "value", -step)

summary_values %>%
  ggplot(aes(x = step, y = value)) +
  geom_line(aes(color = series)) +
  ggtitle("Mean values from simulations")
|
332
src/regenerate_distributions.py
Normal file
332
src/regenerate_distributions.py
Normal file
|
@ -0,0 +1,332 @@
|
|||
"""
|
||||
Distributions (Re)generation Script
|
||||
|
||||
This script generates likelihood and cost distributions based on threat
|
||||
intelligence data stored in a connected Neo4j graph database. It attempts to
|
||||
do so for every possible permutation of (size, industry) values.
|
||||
|
||||
These are then consumed by `montecarlo.py`, which runs a Monte Carlo
|
||||
simulation based on these figures.
|
||||
|
||||
Acknowledgements: Dr Dan Prince & Dr Chris Sherlock
|
||||
"""
|
||||
|
||||
import os
|
||||
import sys
|
||||
import argparse
|
||||
import warnings
|
||||
import logging as log
|
||||
|
||||
from typing import Tuple
|
||||
|
||||
import itertools
|
||||
import numpy as np
|
||||
import pandas as pd
|
||||
import statsmodels.formula.api as smf
|
||||
from matplotlib import pyplot as plt
|
||||
from scipy.stats import lognorm
|
||||
|
||||
from graph import GraphInterface as gi
|
||||
|
||||
# Custom log level for success messages; equivalent to `logging.WARNING` + 1,
# so SUCCESS messages are emitted even at the default (WARNING) log level.
SUCCESS = 31

# The arbitrary maximum number of incidents that an organisation can experience
# in a year.
MAX_ANNUAL_INCIDENTS = 8000

# Quantifies the quantitative boundaries for human-readable incident frequencies,
# which many sources (e.g., the CSBS 2020) use to present their results.
#
# 'None' = 0
# 'Annually' = 1
# 'Less than monthly' = 2–7
# 'Monthly' = 8–17
# 'Weekly' = 18–79
# 'Daily' = 80–399
# 'More than daily' = 400–8000
BOUNDARIES = {
    "None": 0,
    "Once per year": 1,
    "Less than once a month": 2,
    "Once a month": 8,
    "Once a week": 18,
    "Once a day": 80,
    "Several times a day": 400,
    "MAX": MAX_ANNUAL_INCIDENTS,
}

# Populated from the command-line arguments in `main()`.
OUTPUT_DIR = None  # directory that plots are written to
IMAGES = None  # whether to save an image of each intermediate plot
|
||||
|
||||
# pylint: disable=invalid-name
def _generate_new_incident_frequency_distribution(pairing: Tuple = (None, None)) -> int:
    r"""
    Generates a new incident frequency distribution.

    Returns 1 if a distribution was generated and stored in the graph
    database, or 0 if no data was available for *pairing*.

    Notes
    -----

    (Re)generates the incident frequency distribution for a
    :math:`\left(\text{size}, \text{industry}\right)` pairing from the data in
    a Neo4j graph database, by fitting a straight line to the banded
    frequency probabilities in log–log space.

    Currently this only produces one distribution family (parameterised by
    :math:`a` and :math:`b`). Additional types of distribution can be
    implemented by overloading this method (by importing the
    `multipledispatch` package) and returning the values required for defining
    that distribution (e.g., :math:`\mu` and :math:`\sigma` instead of
    :math:`a` and :math:`b`).
    """

    log.info("Generating new incident frequency distribution for '%s'...", str(pairing))

    # Attempts to get the incident probabilities for the pairing from the graph
    # database (one probability per frequency band; the trailing "MAX"
    # boundary is excluded).
    incident_frequency_probabilities = gi.get_incident_frequency_probabilities(
        list(BOUNDARIES.values())[:-1], pairing
    )
    if incident_frequency_probabilities is None:
        log.info(
            "No incident frequency distribution generated for '%s'.",
            str(pairing),
        )
        return 0

    log.debug(
        "Returned values are: incident frequency probabilities = %s",
        str(incident_frequency_probabilities),
    )

    # If values are found, generate a distribution: fit
    # log(1 - F(x)) = a*log(b) - a*log(x), a straight line in log-log space.
    Fs = np.cumsum(incident_frequency_probabilities)

    xs = np.log(list(BOUNDARIES.values())[1:])
    ys = np.log(1 - Fs)
    # Build the frame with explicitly named columns. The previous
    # `pd.DataFrame(xs, ys)` call created a single unnamed column indexed by
    # `ys`, and the formula below only worked by falling back to this
    # function's local variables instead of the DataFrame.
    data = pd.DataFrame({"xs": xs, "ys": ys})

    # pylint: disable=line-too-long
    # See <https://www.statsmodels.org/stable/_modules/statsmodels/stats/stattools.html#omni_normtest> for explanation
    # pylint: enable=line-too-long
    with warnings.catch_warnings():
        warnings.simplefilter("ignore")
        fit = smf.ols(formula="ys ~ xs", data=data).fit()
    log.debug(fit.summary())

    # Get the parameters for the generated distribution and store them in the
    # graph database. Label-based access is unambiguous (positional indexing
    # of `params` is deprecated in recent pandas).
    alogb = fit.params["Intercept"]
    a = -fit.params["xs"]
    b = np.exp(alogb / a)

    gi.create_incident_frequency_distribution_node(pairing, a, b)

    log.log(
        SUCCESS,
        "New incident frequency distribution successfully generated for '%s'.",
        str(pairing),
    )
    return 1
|
||||
|
||||
|
||||
# pylint: enable=invalid-name
|
||||
|
||||
def _generate_new_incident_costs_distribution(pairing: Tuple = (None, None)) -> int:
    r"""
    (Re)generates the incident cost distribution for a
    :math:`\left(\text{size}, \text{industry}\right)` pairing from the data in
    a Neo4j graph database.

    Returns 1 if a distribution was generated and stored, or 0 if no cost
    averages were available for *pairing*.

    Currently this only produces log-normal distributions. Additional types of
    distribution can be implemented by overloading this method (by importing the
    `multipledispatch` package) and returning the values required for defining
    that distribution (e.g., :math:`\mu` and :math:`\sigma` instead of :math:`a`
    and :math:`b`).
    """

    # Plots the distribution for the average cost of incident(s) over 12 months
    log.info("Generating new incident cost distribution for '%s'...", str(pairing))

    incident_mean_cost, incident_median_cost = gi.get_incident_cost_averages(pairing)
    if incident_mean_cost is None or incident_median_cost is None:
        log.info(
            "No incident costs distribution generated for '%s'.",
            str(pairing),
        )
        return 0

    log.debug(
        "Returned values are: mean = %s, median = %s",
        str(incident_mean_cost),
        str(incident_median_cost),
    )

    # For a log-normal distribution, median = e^mu and mean = e^(mu + sigma^2/2),
    # so sigma = sqrt(2 * (ln(mean) - ln(median))) — as in the R prototype.
    # The previous conditional expression bound too loosely and evaluated to
    # ln(median) alone whenever the median was non-zero, silently dropping the
    # ln(mean) term.
    log_stddev = np.sqrt(
        2
        * (
            np.log(incident_mean_cost)
            - (0 if incident_median_cost == 0 else np.log(incident_median_cost))
        )
    )
    stddev = np.exp(log_stddev)

    _label_plot(
        "Average annual incident-with-outcome cost distribution", "Cost (£)", "Density"
    )
    # NOTE(review): lognorm.pdf is called as (x, s, loc) with s=ln(mean) and
    # loc=ln(median), which does not match scipy's usual parameterisation
    # (shape=sigma, scale=e^mu) — this plot is indicative only; confirm.
    plt.plot(
        [
            lognorm.pdf(
                np.log(i),
                np.log(incident_mean_cost),
                np.log(incident_median_cost) if incident_median_cost > 0 else 0,
            )
            for i in range(1, 2500)
        ]
    )
    _save_plot("3 - cost dist")

    gi.create_incident_costs_distribution_node(pairing, incident_mean_cost, stddev)

    log.log(
        SUCCESS,
        "New incident costs distribution successfully generated for '%s'.",
        str(pairing),
    )
    return 1
|
||||
|
||||
|
||||
def _generate_new_distributions(pairing: Tuple = (None, None)) -> Tuple:
    """(Re)generates the cost and likelihood distributions.

    Deletes any existing distribution nodes, then attempts to generate a
    frequency and a cost distribution for every applicable (size, industry)
    combination.

    Parameters
    ----------
    pairing : Tuple
        The (size, industry) pairing to regenerate for; a ``None`` element
        means "every known value" for that axis.

    Returns
    -------
    Tuple
        ``(successful_incidents_dists, successful_costs_dists)`` counts.
    """

    gi.__init__()

    log.info("Existing distributions deleted: %s", bool(gi.delete_distributions()))

    successful_incidents_dists = 0
    successful_costs_dists = 0

    # If either size or industry is unspecified, gets all possible values.
    sizes = gi.get_sizes() if pairing[0] is None else [pairing[0]]
    industries = gi.get_industries() if pairing[1] is None else [pairing[1]]

    # Attempts to generate new distributions for every combination of size and
    # industry values. (itertools.product can be iterated directly; the
    # previous list() materialisation was unnecessary.)
    for pair in itertools.product(sizes, industries):
        successful_incidents_dists += _generate_new_incident_frequency_distribution(
            pair
        )
        successful_costs_dists += _generate_new_incident_costs_distribution(pair)

    return successful_incidents_dists, successful_costs_dists
|
||||
|
||||
|
||||
def main():
    """Called when the script is run from the command-line.

    Parses the CLI arguments, configures logging, regenerates the incident
    frequency and cost distributions for the requested (size, industry)
    pairing, and exits the process with status 0.
    """
    # pylint: disable=global-statement
    global OUTPUT_DIR, IMAGES
    # pylint: enable=global-statement

    parser = argparse.ArgumentParser()

    parser.add_argument(
        "-s",
        "--size",
        help="Specify the org. size (default: None)",
        choices=["micro", "small", "medium", "large"],
        type=str,
        default=None,
    )
    parser.add_argument(
        "-i",
        "--industry",
        help="Specify the org. industry SIC code (top-level only, e.g. ‘C’ for "
        "‘Manufacturing’) (default: None)",
        # SIC 2007 top-level sections 'A'–'U'.
        choices=list(map(chr, range(65, 86))),
        # The previous `type=chr` crashed on any supplied value: argparse
        # passes the argument string to the converter, and the `chr` builtin
        # only accepts an integer.
        type=str,
        default=None,
    )
    parser.add_argument(
        "-o",
        "--output",
        help="Specify the output directory (default: ./output/)",
        type=str,
        default=os.path.join(os.path.dirname(__file__), "output/"),
        metavar="DIRECTORY",
    )
    parser.add_argument(
        "-p",
        "--images",
        help="Output images at each step of the script (default: false, just "
        "output the final LEC image)",
        action="store_true",
        default=False,
    )
    parser.add_argument(
        "-v",
        "--verbose",
        help="Verbose console output (default: false)",
        action="store_true",
        default=False,
    )
    parser.add_argument(
        "-d",
        "--debug",
        help="Show debug console output (default: false)",
        action="store_true",
        default=False,
    )

    args = parser.parse_args()

    OUTPUT_DIR = args.output
    IMAGES = args.images

    size = args.size
    industry = args.industry

    # Logging verbosity: --debug beats --verbose; the default shows only
    # warnings and above (plus SUCCESS-level messages).
    if args.debug:
        log.basicConfig(format="%(levelname)s: %(message)s", level=log.DEBUG)
        log.info("Debug output.")
    elif args.verbose:
        log.basicConfig(format="%(levelname)s: %(message)s", level=log.INFO)
        log.info("Verbose output.")
    else:
        log.basicConfig(format="%(levelname)s: %(message)s")

    # Ensures the output directory exists before any plots are written.
    if not os.path.isdir(OUTPUT_DIR):
        os.makedirs(OUTPUT_DIR)

    incidents_dists, costs_dists = _generate_new_distributions((size, industry))

    log.log(
        SUCCESS,
        "Successfully generated %s incident frequency distributions and %s "
        "incident costs distributions!",
        str(incidents_dists),
        str(costs_dists),
    )

    sys.exit(0)
|
||||
|
||||
|
||||
def _label_plot(title="Untitled Plot", xlabel="x axis", ylabel="y axis") -> None:
    """Set the title plus x- and y-axis labels on the active pyplot figure."""
    labellers = {plt.title: title, plt.xlabel: xlabel, plt.ylabel: ylabel}
    for apply_label, text in labellers.items():
        apply_label(text)
|
||||
|
||||
|
||||
def _save_plot(filename="untitled") -> None:
    """Persist the current figure as a PNG when image output is enabled,
    then clear it ready for the next plot."""
    if IMAGES:
        plt.savefig("{}{}.png".format(OUTPUT_DIR, filename))
    plt.clf()
|
||||
|
||||
|
||||
# Script entry point: regenerate distributions only when executed directly,
# not when this module is imported.
if __name__ == "__main__":
    main()
|
Reference in a new issue