Just Scraping Alpha

This commit is contained in:
Rumps 2017-01-13 17:56:41 +00:00
parent 75e3dc5790
commit b6c1660f04
24 changed files with 8 additions and 1000 deletions

126
Scraping_Alpha/JSONtoSQL.py Normal file

@@ -0,0 +1,126 @@
#
# Title: Scraping Alpha
# Version: 1.0
# Author: Ben Goldsworthy <b.goldsworthy@lancaster.ac.uk>
#
# This file is a part of Scraping Alpha, a series of scripts to scrape
# earnings call transcripts from seekingalpha.com and present them as useful
# SQL.
#
# This file takes the `transcripts.json` file output of `transcript_spider.py`
# and converts it into SQL.
#
# This file should be located in the same directory as `transcripts.json`, and
# is run via 'python JSONtoSQL.py > [FILE].sql', where '[FILE]' is the name of
# the output file.
#
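# For reference, each record in `transcripts.json` is assumed to take roughly
# the following shape (the field names are those read by the code below; the
# example values are hypothetical):
#
#   {"entry": {"title":      "Acme Corp Q4 2016 Earnings Call",
#              "company":    "Acme Corp",
#              "exec":       [["Jane Doe", "CEO", "Acme Corp"]],
#              "analysts":   [["John Smith", "Example Securities"]],
#              "transcript": "<p>...</p>"}}
#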
import json
import sys
import codecs
sys.stdout=codecs.getwriter('utf-8')(sys.stdout)
json_data=open('transcripts.json').read()
data = json.loads(json_data)
executives = []
analysts = []
# For each transcript, creates new, separate arrays of executives and analysts
# for their own database tables, replacing their tuples in the transcript with
# their database keys.
for entry in data:
    indexExec = len(executives)+1
    indexAnal = len(analysts)+1

    newExecs = []
    for executive in entry['entry']['exec']:
        if executive not in executives:
            executives.append(executive)
            newExecs.append(indexExec)
            indexExec += 1
        else:
            newExecs.append(executives.index(executive) + 1)
    entry['entry']['exec'] = newExecs

    newAnals = []
    for analyst in entry['entry']['analysts']:
        if analyst not in analysts:
            analysts.append(analyst)
            newAnals.append(indexAnal)
            indexAnal += 1
        else:
            newAnals.append(analysts.index(analyst) + 1)
    entry['entry']['analysts'] = newAnals
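# A worked example of the mapping above (names hypothetical): if transcript 1
# lists the executive ("Jane Doe", "CEO", "Acme Corp") and transcript 2 lists
# the same tuple, the first pass appends it to `executives` (key 1) and both
# transcripts' 'exec' lists end up holding the key 1 rather than the tuple.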
# Outputs the SQL file that creates the various tables and populates them with
# INSERT statements.
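# As a rough sketch (the values are placeholders), the generated file begins:
#
#   CREATE TABLE IF NOT EXISTS `execs`
#   (
#       `id` INT NOT NULL UNIQUE AUTO_INCREMENT,
#       ...
#   );
#
#   INSERT INTO `execs` (`name`, `position`, `company`) VALUES
#       (0,0,0) ,
#       ("Jane Doe","CEO","Acme Corp") ;
#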
print "CREATE TABLE IF NOT EXISTS `execs`"
print "("
print "\t`id` INT NOT NULL UNIQUE AUTO_INCREMENT,"
print "\t`name` VARCHAR(255),"
print "\t`position` VARCHAR(255),"
print "\t`company` VARCHAR(255),"
print "\tPRIMARY KEY(`id`)"
print ");\n"
print "INSERT INTO `execs` (`name`, `position`, `company`) VALUES"
print "\t(0,0,0)",
for executive in executives:
    print ","
    print "\t(\""+executive[0]+"\",\""+executive[1]+"\",\""+executive[2]+"\")",
print ";\n"
print "CREATE TABLE IF NOT EXISTS `analysts`"
print "("
print "\t`id` INT NOT NULL UNIQUE AUTO_INCREMENT,"
print "\t`name` VARCHAR(255),"
print "\t`company` VARCHAR(255),"
print "\tPRIMARY KEY(`id`)"
print ");\n"
print "INSERT INTO `analysts` (`name`, `company`) VALUES"
print "\t(0,0)",
for analyst in analysts:
    print ","
    print "\t(\""+analyst[0]+"\",\""+analyst[1]+"\")",
print ";\n"
print "CREATE TABLE IF NOT EXISTS `transcripts`"
print "("
print "\t`id` INT NOT NULL UNIQUE AUTO_INCREMENT,"
print "\t`title` VARCHAR(255),"
print "\t`company` VARCHAR(255),"
print "\t`execs` VARCHAR(255),"
print "\t`analysts` VARCHAR(255),"
print "\t`transcript` TEXT,"
print "\tPRIMARY KEY(`id`)"
print ");\n"
print "INSERT INTO `transcripts` (`title`, `company`, `execs`, `analysts`, `transcript`) VALUES"
print "\t(0,0,0,0,0)",
for entry in data:
    tran = entry['entry']
    print ","
    print "\t(\""+tran['title']+"\",\""+tran['company']+"\",\""+(';'.join(str(x) for x in tran['exec']))+"\",\""+(';'.join(str(x) for x in tran['analysts']))+"\",\""+tran['transcript']+"\")",
print ";\n"
print "CREATE TABLE IF NOT EXISTS `execs_to_transcripts`"
print "("
print "\t`exec_id` INT NOT NULL,"
print "\t`transcript_id` INT NOT NULL,"
print "\tPRIMARY KEY(`exec_id`, `transcript_id`),"
print "\tFOREIGN KEY (`exec_id`) REFERENCES `execs`(`id`),"
print "\tFOREIGN KEY (`transcript_id`) REFERENCES `transcripts`(`id`)"
print ");\n"
print "CREATE TABLE IF NOT EXISTS `analysts_to_transcripts`"
print "("
print "\t`analyst_id` INT NOT NULL,"
print "\t`transcript_id` INT NOT NULL,"
print "\tPRIMARY KEY(`analyst_id`, `transcript_id`),"
print "\tFOREIGN KEY (`analyst_id`) REFERENCES `analysts`(`id`),"
print "\tFOREIGN KEY (`transcript_id`) REFERENCES `transcripts`(`id`)"
print ");"

Binary file not shown.


@@ -0,0 +1,14 @@
# -*- coding: utf-8 -*-
# Define here the models for your scraped items
#
# See documentation in:
# http://doc.scrapy.org/en/latest/topics/items.html
import scrapy
class ScrapingAlphaItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    pass


@@ -0,0 +1,56 @@
# -*- coding: utf-8 -*-
# Define here the models for your spider middleware
#
# See documentation in:
# http://doc.scrapy.org/en/latest/topics/spider-middleware.html
from scrapy import signals
class ScrapingAlphaSpiderMiddleware(object):
    # Not all methods need to be defined. If a method is not defined,
    # scrapy acts as if the spider middleware does not modify the
    # passed objects.

    @classmethod
    def from_crawler(cls, crawler):
        # This method is used by Scrapy to create your spiders.
        s = cls()
        crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
        return s

    def process_spider_input(self, response, spider):
        # Called for each response that goes through the spider
        # middleware and into the spider.
        # Should return None or raise an exception.
        return None

    def process_spider_output(self, response, result, spider):
        # Called with the results returned from the Spider, after
        # it has processed the response.
        # Must return an iterable of Request, dict or Item objects.
        for i in result:
            yield i

    def process_spider_exception(self, response, exception, spider):
        # Called when a spider or process_spider_input() method
        # (from other spider middleware) raises an exception.
        # Should return either None or an iterable of Response, dict
        # or Item objects.
        pass

    def process_start_requests(self, start_requests, spider):
        # Called with the start requests of the spider, and works
        # similarly to the process_spider_output() method, except
        # that it doesn't have a response associated.
        # Must return only requests (not items).
        for r in start_requests:
            yield r

    def spider_opened(self, spider):
        spider.logger.info('Spider opened: %s' % spider.name)


@@ -0,0 +1,11 @@
# -*- coding: utf-8 -*-
# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html
class ScrapingAlphaPipeline(object):
    def process_item(self, item, spider):
        return item


@@ -0,0 +1,91 @@
# -*- coding: utf-8 -*-
# Scrapy settings for Scraping_Alpha project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
# http://doc.scrapy.org/en/latest/topics/settings.html
# http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
# http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html
BOT_NAME = 'Scraping_Alpha'
SPIDER_MODULES = ['Scraping_Alpha.spiders']
NEWSPIDER_MODULE = 'Scraping_Alpha.spiders'
USER_AGENT = "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/27.0.1453.93 Safari/537.36"
# Crawl responsibly by identifying yourself (and your website) on the user-agent
#USER_AGENT = 'Scraping_Alpha (+http://www.yourdomain.com)'
# Obey robots.txt rules
ROBOTSTXT_OBEY = True
# Configure maximum concurrent requests performed by Scrapy (default: 16)
#CONCURRENT_REQUESTS = 32
# Configure a delay for requests for the same website (default: 0)
# See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
#DOWNLOAD_DELAY = 3
# The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16
# Disable cookies (enabled by default)
#COOKIES_ENABLED = False
# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False
# Override the default request headers:
#DEFAULT_REQUEST_HEADERS = {
# 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
# 'Accept-Language': 'en',
#}
# Enable or disable spider middlewares
# See http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html
#SPIDER_MIDDLEWARES = {
# 'Scraping_Alpha.middlewares.ScrapingAlphaSpiderMiddleware': 543,
#}
# Enable or disable downloader middlewares
# See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
#DOWNLOADER_MIDDLEWARES = {
# 'Scraping_Alpha.middlewares.MyCustomDownloaderMiddleware': 543,
#}
# Enable or disable extensions
# See http://scrapy.readthedocs.org/en/latest/topics/extensions.html
#EXTENSIONS = {
# 'scrapy.extensions.telnet.TelnetConsole': None,
#}
# Configure item pipelines
# See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html
#ITEM_PIPELINES = {
# 'Scraping_Alpha.pipelines.SomePipeline': 300,
#}
# Enable and configure the AutoThrottle extension (disabled by default)
# See http://doc.scrapy.org/en/latest/topics/autothrottle.html
#AUTOTHROTTLE_ENABLED = True
# The initial download delay
#AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
#AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server
#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
#AUTOTHROTTLE_DEBUG = False
# Enable and configure HTTP caching (disabled by default)
# See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
#HTTPCACHE_ENABLED = True
#HTTPCACHE_EXPIRATION_SECS = 0
#HTTPCACHE_DIR = 'httpcache'
#HTTPCACHE_IGNORE_HTTP_CODES = []
#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'

Binary file not shown.


@@ -0,0 +1,4 @@
# This package will contain the spiders of your Scrapy project
#
# Please refer to the documentation for information on how to create and manage
# your spiders.

Binary file not shown.


@@ -0,0 +1,79 @@
#
# Title: Scraping Alpha
# Version: 1.0
# Author: Ben Goldsworthy <b.goldsworthy@lancaster.ac.uk>
#
# This file is a part of Scraping Alpha, a series of scripts to scrape
# earnings call transcripts from seekingalpha.com and present them as useful
# SQL.
#
# This file is the webspider that Scrapy uses to retrieve slides.
#
import scrapy
urls = []
# A transcript record can be uniquely identified using its company name + date.
uniqueID = ""
# Some transcript preambles are concatenated on a single line. This list is used
# to separate the title and date sections of the string.
months = ["Jan", "Feb", "Mar", "Apr", "May", "Jun", "Jul", "Aug", "Sep", "Oct", "Nov", "Dec"]
class SlidesSpider(scrapy.Spider):
    name = 'slides'
    start_urls = ['http://seekingalpha.com/earnings/earnings-call-transcripts/1']

    def parse(self, response):
        # Follows each transcript page's link from the given index page.
        for href in response.css('.dashboard-article-link::attr(href)').extract():
            yield scrapy.Request(response.urljoin(href), callback=self.parse_transcript)

        # Follows the pagination links at the bottom of the given index page.
        next_page = response.css('li.next a::attr(href)').extract_first()
        if next_page is not None:
            next_page = response.urljoin(next_page)
            yield scrapy.Request(next_page, callback=self.parse)

    def parse_transcript(self, response):
        slides = response.css('li#slides a::attr(href)').extract_first()
        if slides is not None:
            body = response.css('div#a-body p.p1')
            chunks = body.css('p.p1')
            i = 0
            while i < 3:
                # If we're on the first line of the preamble, that's the
                # company name, stock exchange and ticker acronym (or should
                # be - see below)
                if i == 0:
                    # Checks to see if the second line is a heading. If not,
                    # everything is fine.
                    if len(chunks[1].css('strong::text').extract()) == 0:
                        uniqueID = chunks[i].css('p::text').extract_first()
                        if " (" in uniqueID:
                            uniqueID = uniqueID.split(' (')[0]
                        i = 2
                    # However, if it is, that means this line contains the
                    # full, concatenated preamble, so everything must be
                    # extracted here
                    else:
                        uniqueID = chunks[i].css('p::text').extract_first()
                        if " (" in uniqueID:
                            uniqueID = uniqueID.split(' (')[0]
                        titleAndDate = chunks[i].css('p::text').extract()[1]
                        for date in months:
                            if date in titleAndDate:
                                splits = titleAndDate.split(date)
                                uniqueID = uniqueID + ";" + date + splits[1]
                                i = 3
                # Otherwise, we're onto the date line.
                elif i == 2:
                    uniqueID = uniqueID + ";" + chunks[i].css('p::text').extract_first()
                i += 1

            slides = response.urljoin(slides)
            yield uniqueID
            #yield scrapy.Request(slides, callback=self.parse_slides)

    def parse_slides(self, response):
        urls = response.css('figure img::attr(src)').extract()
        yield uniqueID + "\\\\" + ';'.join(urls)


@@ -0,0 +1,168 @@
#-*- coding: utf-8 -*-
# Title: Scraping Alpha
# Version: 1.0
# Author: Ben Goldsworthy <b.goldsworthy@lancaster.ac.uk>
#
# This file is a part of Scraping Alpha, a series of scripts to scrape
# earnings call transcripts from seekingalpha.com and present them as useful
# SQL.
#
# This file is the webspider that Scrapy uses to retrieve the information from
# the website. Left unattended, it will scrape all 4,000+ pages of results.
#
# To interrupt this behaviour and still be able to proceed with the other
# steps, cancel the script with CTRL+Z. This will likely leave an unfinished
# JSON item at the end of the output file. To clear this up, open the file
# in vim and type the following keys:
# 'G', 'V', 'd', '$', 'i', 'BACKSPACE', 'ENTER', ']', 'ESC', ':wq', 'ENTER'
# This will truncate the file at the last complete record and seal it off.
#
# For installation instructions for Scrapy, visit
# <doc.scrapy.org/en/latest/intro/install.html>. This file should be in the
# `spiders` directory of the project, and is run via 'scrapy crawl transcripts
# -o transcripts.json' at the command line (the output file will be placed
# in the directory the Terminal is currently in).
#
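# Each transcript is yielded as a dict of the following rough shape (the keys
# are those set in parse_transcript() below; the values here are hypothetical):
#
#   {"entry": {"company": "Acme Corp", "exchange": "NYSE", "ticker": "ACME",
#              "title": "Acme Corp Q4 2016 Earnings Call",
#              "date": "January 12, 2017",
#              "exec": [("Jane Doe", "CEO", "Acme Corp")],
#              "analysts": [("John Smith", "Example Securities")],
#              "transcript": "<p>...</p><h1>Operator</h1><p>...</p>"}}
#
# `scrapy crawl transcripts -o transcripts.json` serialises these into the
# `transcripts.json` file that `JSONtoSQL.py` consumes.
#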
import scrapy
# This enum lists the stages of each transcript.
from enum import Enum
Stage = Enum('Stage', 'preamble execs analysts body')
# Some transcript preambles are concatenated on a single line. This list is used
# to separate the title and date sections of the string.
months = ["Jan", "Feb", "Mar", "Apr", "May", "Jun", "Jul", "Aug", "Sep", "Oct", "Nov", "Dec"]
transcripts = {}
class TranscriptSpider(scrapy.Spider):
    name = 'transcripts'
    start_urls = ['http://seekingalpha.com/earnings/earnings-call-transcripts/1']

    def parse(self, response):
        # Follows each transcript page's link from the given index page.
        for href in response.css('.dashboard-article-link::attr(href)').extract():
            yield scrapy.Request(response.urljoin(href), callback=self.parse_transcript)

        # Follows the pagination links at the bottom of the given index page.
        next_page = response.css('li.next a::attr(href)').extract_first()
        if next_page is not None:
            next_page = response.urljoin(next_page)
            yield scrapy.Request(next_page, callback=self.parse)

    def parse_transcript(self, response):
        i = 0
        transcript = {}
        details = {}
        execs = []
        analysts = []
        script = []
        mode = 1

        # As the pages are represented by a series of `<p>` elements, all with
        # the same class `.p1` and no unique identifiers, we have to do this the
        # old-fashioned way - breaking it into chunks and iterating over them.
        body = response.css('div#a-body p.p1')
        chunks = body.css('p.p1')
        while i < len(chunks):
            # If the current line is a heading and we're not currently going
            # through the transcript body (where headings represent speakers),
            # change the current section flag to the next section.
            if (len(chunks[i].css('strong::text').extract()) == 0) or (mode == 4):
                currStage = Stage(mode)
                # If we're on the preamble stage, each bit of data is extracted
                # separately as they all have their own key in the JSON.
                if currStage == Stage['preamble']:
                    # If we're on the first line of the preamble, that's the
                    # company name, stock exchange and ticker acronym (or should
                    # be - see below)
                    if i == 0:
                        # Checks to see if the second line is a heading. If not,
                        # everything is fine.
                        if len(chunks[1].css('strong::text').extract()) == 0:
                            details['company'] = chunks[i].css('p::text').extract_first()
                            if " (" in details['company']:
                                details['company'] = details['company'].split(' (')[0]
                            # If a specific stock exchange is not listed, it
                            # defaults to NYSE
                            details['exchange'] = "NYSE"
                            details['ticker'] = chunks.css('a::text').extract_first()
                            if ":" in details['ticker']:
                                ticker = details['ticker'].split(':')
                                details['exchange'] = ticker[0]
                                details['ticker'] = ticker[1]
                        # However, if it is, that means this line contains the
                        # full, concatenated preamble, so everything must be
                        # extracted here
                        else:
                            details['company'] = chunks[i].css('p::text').extract_first()
                            if " (" in details['company']:
                                details['company'] = details['company'].split(' (')[0]
                            # If a specific stock exchange is not listed, default to NYSE
                            details['exchange'] = "NYSE"
                            details['ticker'] = chunks.css('a::text').extract_first()
                            if ":" in details['ticker']:
                                ticker = details['ticker'].split(':')
                                details['exchange'] = ticker[0]
                                details['ticker'] = ticker[1]
                            titleAndDate = chunks[i].css('p::text').extract()[1]
                            for date in months:
                                if date in titleAndDate:
                                    splits = titleAndDate.split(date)
                                    details['title'] = splits[0]
                                    details['date'] = date + splits[1]
                    # Otherwise, we're onto the title line.
                    elif i == 1:
                        title = chunks[i].css('p::text').extract_first()
                        # This should never be the case, but just to be careful
                        # I'm leaving it in.
                        if len(title) <= 0:
                            title = "NO TITLE"
                        details['title'] = title
                    # Or the date line.
                    elif i == 2:
                        details['date'] = chunks[i].css('p::text').extract_first()
                # If we're onto the 'Executives' section, we create a list of
                # all of their names, positions and company name (from the
                # preamble).
                elif currStage == Stage['execs']:
                    anExec = chunks[i].css('p::text').extract_first().split(" - ")
                    # This covers if the execs are separated with an em- rather
                    # than an en-dash (see above).
                    if len(anExec) <= 1:
                        anExec = chunks[i].css('p::text').extract_first().split(" — ")
                    name = anExec[0]
                    if len(anExec) > 1:
                        position = anExec[1]
                    # Again, this should never be the case, as an Exec-less
                    # company would find it hard to get much done.
                    else:
                        position = ""
                    execs.append((name, position, details['company']))
                # This does the same, but with the analysts (which never seem
                # to be separated by em-dashes for some reason).
                elif currStage == Stage['analysts']:
                    name = chunks[i].css('p::text').extract_first().split(" - ")[0]
                    company = chunks[i].css('p::text').extract_first().split(" - ")[1]
                    analysts.append((name, company))
                # This strips the transcript body of everything except simple
                # HTML, and stores that.
                elif currStage == Stage['body']:
                    line = chunks[i].css('p::text').extract_first()
                    html = "p>"
                    if line is None:
                        line = chunks[i].css('strong::text').extract_first()
                        html = "h1>"
                    script.append("<" + html + line + "</" + html)
            else:
                mode += 1
            i += 1

        # Adds the various arrays to the dictionary for the transcript
        details['exec'] = execs
        details['analysts'] = analysts
        details['transcript'] = ''.join(script)

        # Adds this transcript to the dictionary of all scraped
        # transcripts, and yields that for the output
        transcript["entry"] = details
        yield transcript


@@ -0,0 +1,111 @@
#
# Title: Scraping Alpha
# Version: 1.0
# Author: Ben Goldsworthy <b.goldsworthy@lancaster.ac.uk>
#
# This file is a part of Scraping Alpha, a series of scripts to scrape
# earnings call transcripts from seekingalpha.com and present them as useful
# SQL.
#
# This file takes the `transcripts.sql` file exported from the database
# created using the output file of `JSONtoSQL.py` after the following query:
# SELECT `id`, `execs`, `analysts` FROM `transcripts`
# It creates from this two `execs.sql` and `analysts.sql` for creating linking
# tables in the database.
#
# This file should be located in the same directory as `transcripts.sql`, and
# is run via 'python execsAndAnalysts'.
#
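# As a sketch of the transformation this script performs (the row values are
# hypothetical): an exported line such as
#
#   (7, '1;2;3', '4;5'),
#
# becomes, in `execs.sql`,
#
#   (7, 1),
#   (7, 2),
#   (7, 3),
#
# and, in `analysts.sql`,
#
#   (7, 4),
#   (7, 5),
#
# each prefixed with the matching `INSERT INTO ... VALUES` header.
#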
import sys
import codecs
import os
from shutil import copyfile
import fileinput

sys.stdout = codecs.getwriter('utf-8')(sys.stdout)

start = 0

# Creates a temporary copy in case something goes Pete Tong.
copyfile("transcripts.sql", "transcripts.sql.tmp")

# Trims everything from the export except for the INSERT statement.
for line in fileinput.FileInput("transcripts.sql.tmp", inplace=1):
    if start == 0:
        if "INSERT INTO `transcripts`" in line:
            start = 1
            print "INSERT INTO `execs_to_transcripts` (`exec_id`, `transcript_id`) VALUES"
    else:
        if line == "\n":
            start = 0
        else:
            print "\t"+line,

# Copies the produced file to create both the output files, then deletes the
# temporary file.
copyfile("transcripts.sql.tmp", "execs.sql")
copyfile("transcripts.sql.tmp", "analysts.sql")
os.remove("transcripts.sql.tmp")

# Converts each line "(x, '0;...;n')" in the file to n separate INSERTs, one
# for each executive.
start = 0
nL = ""
for line in fileinput.FileInput("execs.sql", inplace=1):
    if start == 0:
        start = 1
    else:
        bits = line.split(', ')
        tID = bits[0].strip('\t').strip('(')
        execs = bits[1].split(';')
        newLines = ""
        for execID in execs:
            newLines = newLines + nL + "\t("+tID+", "+execID.strip('\'')+"),"
        line = line.replace(line, newLines)
        nL = "\n"
    print line,

# Does the same for the analysts.
start = 0
nL = ""
for line in fileinput.FileInput("analysts.sql", inplace=1):
    if start == 0:
        start = 1
        line = line.replace(line, "INSERT INTO `analysts_to_transcripts` (`analyst_id`, `transcript_id`) VALUES\n")
    else:
        bits = line.split(', ')
        tID = bits[0].strip('\t').strip('(')
        # As it is possible for there to be no analysts in a call, this ignores
        # blank results.
        if "''" not in bits[2]:
            analysts = bits[2].split(';')
            newLines = ""
            for analystID in analysts:
                # This stops the final transcript from getting an additional,
                # `analyst_id`-less INSERT
                if analystID != '\n':
                    newLines = newLines + nL + "\t("+tID+", "+analystID.strip('\'').strip('\'),\n')+"),"
            line = line.replace(line, newLines)
            nL = "\n"
        else:
            line = ""
    print line,

# Replaces the final comma at the end of each file with a semicolon, to make it
# valid SQL.
with open("execs.sql", 'rb+') as filehandle:
    filehandle.seek(-1, os.SEEK_END)
    filehandle.truncate()
    filehandle.write(";")

with open("analysts.sql", 'rb+') as filehandle:
    filehandle.seek(-1, os.SEEK_END)
    filehandle.truncate()
    filehandle.write(";")
    # `analysts.sql` then performs some cleanup on the database.
    filehandle.write("\n\nALTER TABLE `transcripts`\n\tDROP COLUMN `execs`,\n\tDROP COLUMN `analysts`;\n\n")
    filehandle.write("DELETE FROM `transcripts` WHERE `id` = 0;\n")
    filehandle.write("DELETE FROM `execs` WHERE `id` = 0;\n")
    filehandle.write("DELETE FROM `analysts` WHERE `id` = 0;\n")
    filehandle.write("DELETE FROM `execs_to_transcripts` WHERE `transcript_id` = 0;\n")
    filehandle.write("DELETE FROM `analysts_to_transcripts` WHERE `transcript_id` = 0;\n")

11
Scraping_Alpha/scrapy.cfg Normal file

@@ -0,0 +1,11 @@
# Automatically created by: scrapy startproject
#
# For more information about the [deploy] section see:
# https://scrapyd.readthedocs.org/en/latest/deploy.html
[settings]
default = Scraping_Alpha.settings
[deploy]
#url = http://localhost:6800/
project = Scraping_Alpha

0
Scraping_Alpha/urls.json Normal file