1.0 fuck yes
parent f0998ecd98
commit 57800c7114
17 changed files with 761 additions and 73 deletions
scraping-alpha/Scraping_Alpha/JSONtoSQL.py (new file)
@@ -0,0 +1,126 @@
#
# Title: Scraping Alpha
# Version: 1.0
# Author: Ben Goldsworthy <b.goldsworthy@lancaster.ac.uk>
#
# This file is a part of Scraping Alpha, a series of scripts to scrape
# earnings call transcripts from seekingalpha.com and present them as useful
# SQL.
#
# This file takes the `transcripts.json` file output of `transcript_spider.py`
# and converts it into SQL.
#
# This file should be located in the same directory as `transcripts.json`, and
# is run via 'python JSONtoSQL.py > [FILE].sql', where '[FILE]' is the name of
# the output file.
#

import json
import sys
import codecs

sys.stdout = codecs.getwriter('utf-8')(sys.stdout)

json_data = open('transcripts.json').read()

data = json.loads(json_data)

executives = []
analysts = []

# For each transcript, creates new, separate arrays of executives and analysts
# for their own database tables, replacing their tuples in the transcript with
# their database keys.
for entry in data:
    indexExec = len(executives)+1
    indexAnal = len(analysts)+1

    newExecs = []
    for executive in entry['entry']['exec']:
        if executive not in executives:
            executives.append(executive)
            newExecs.append(indexExec)
            indexExec += 1
        else:
            newExecs.append(executives.index(executive) + 1)
    entry['entry']['exec'] = newExecs

    newAnals = []
    for analyst in entry['entry']['analysts']:
        if analyst not in analysts:
            analysts.append(analyst)
            newAnals.append(indexAnal)
            indexAnal += 1
        else:
            newAnals.append(analysts.index(analyst) + 1)
    entry['entry']['analysts'] = newAnals

# Outputs the SQL file that creates the various tables and populates them with
# INSERT statements.
print "CREATE TABLE IF NOT EXISTS `execs`"
print "("
print "\t`id` INT NOT NULL UNIQUE AUTO_INCREMENT,"
print "\t`name` VARCHAR(255),"
print "\t`position` VARCHAR(255),"
print "\t`company` VARCHAR(255),"
print "\tPRIMARY KEY(`id`)"
print ");\n"

print "INSERT INTO `execs` (`name`, `position`, `company`) VALUES"
print "\t(0,0,0)",
for executive in executives:
    print ","
    print "\t(\""+executive[0]+"\",\""+executive[1]+"\",\""+executive[2]+"\")",
print ";\n"

print "CREATE TABLE IF NOT EXISTS `analysts`"
print "("
print "\t`id` INT NOT NULL UNIQUE AUTO_INCREMENT,"
print "\t`name` VARCHAR(255),"
print "\t`company` VARCHAR(255),"
print "\tPRIMARY KEY(`id`)"
print ");\n"

print "INSERT INTO `analysts` (`name`, `company`) VALUES"
print "\t(0,0)",
for analyst in analysts:
    print ","
    print "\t(\""+analyst[0]+"\",\""+analyst[1]+"\")",
print ";\n"

print "CREATE TABLE IF NOT EXISTS `transcripts`"
print "("
print "\t`id` INT NOT NULL UNIQUE AUTO_INCREMENT,"
print "\t`title` VARCHAR(255),"
print "\t`company` VARCHAR(255),"
print "\t`execs` VARCHAR(255),"
print "\t`analysts` VARCHAR(255),"
print "\t`transcript` TEXT,"
print "\tPRIMARY KEY(`id`)"
print ");\n"

print "INSERT INTO `transcripts` (`title`, `company`, `execs`, `analysts`, `transcript`) VALUES"
print "\t(0,0,0,0,0)",
for entry in data:
    tran = entry['entry']
    print ","
    print "\t(\""+tran['title']+"\",\""+tran['company']+"\",\""+(';'.join(str(x) for x in tran['exec']))+"\",\""+(';'.join(str(x) for x in tran['analysts']))+"\",\""+tran['transcript']+"\")",
print ";\n"

print "CREATE TABLE IF NOT EXISTS `execs_to_transcripts`"
print "("
print "\t`exec_id` INT NOT NULL,"
print "\t`transcript_id` INT NOT NULL,"
print "\tPRIMARY KEY(`exec_id`, `transcript_id`),"
print "\tFOREIGN KEY (`exec_id`) REFERENCES `execs`(`id`),"
print "\tFOREIGN KEY (`transcript_id`) REFERENCES `transcripts`(`id`)"
print ");\n"

print "CREATE TABLE IF NOT EXISTS `analysts_to_transcripts`"
print "("
print "\t`analyst_id` INT NOT NULL,"
print "\t`transcript_id` INT NOT NULL,"
print "\tPRIMARY KEY(`analyst_id`, `transcript_id`),"
print "\tFOREIGN KEY (`analyst_id`) REFERENCES `analysts`(`id`),"
print "\tFOREIGN KEY (`transcript_id`) REFERENCES `transcripts`(`id`)"
print ");"
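For reference, a minimal sketch of one record in `transcripts.json` as the script above reads it; the keys mirror what `JSONtoSQL.py` accesses, but the values here are invented:

# Illustrative record only -- field values are made up; the keys come from the
# lookups in JSONtoSQL.py above (entry -> title/company/exec/analysts/transcript).
example_record = {
    "entry": {
        "title": "Example Corp Q4 2016 Results Earnings Call",
        "company": "Example Corp",
        "exec": [["Jane Doe", "Chief Executive Officer", "Example Corp"]],
        "analysts": [["John Smith", "Example Securities"]],
        "transcript": "<h1>Operator</h1><p>Good day, everyone...</p>"
    }
}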
scraping-alpha/Scraping_Alpha/Scraping_Alpha/__init__.py (new empty file)

scraping-alpha/Scraping_Alpha/Scraping_Alpha/__init__.pyc (new binary file, not shown)
scraping-alpha/Scraping_Alpha/Scraping_Alpha/items.py (new file)
@@ -0,0 +1,14 @@
# -*- coding: utf-8 -*-

# Define here the models for your scraped items
#
# See documentation in:
# http://doc.scrapy.org/en/latest/topics/items.html

import scrapy


class ScrapingAlphaItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    pass
scraping-alpha/Scraping_Alpha/Scraping_Alpha/middlewares.py (new file)
@@ -0,0 +1,56 @@
# -*- coding: utf-8 -*-

# Define here the models for your spider middleware
#
# See documentation in:
# http://doc.scrapy.org/en/latest/topics/spider-middleware.html

from scrapy import signals


class ScrapingAlphaSpiderMiddleware(object):
    # Not all methods need to be defined. If a method is not defined,
    # scrapy acts as if the spider middleware does not modify the
    # passed objects.

    @classmethod
    def from_crawler(cls, crawler):
        # This method is used by Scrapy to create your spiders.
        s = cls()
        crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
        return s

    def process_spider_input(response, spider):
        # Called for each response that goes through the spider
        # middleware and into the spider.

        # Should return None or raise an exception.
        return None

    def process_spider_output(response, result, spider):
        # Called with the results returned from the Spider, after
        # it has processed the response.

        # Must return an iterable of Request, dict or Item objects.
        for i in result:
            yield i

    def process_spider_exception(response, exception, spider):
        # Called when a spider or process_spider_input() method
        # (from other spider middleware) raises an exception.

        # Should return either None or an iterable of Response, dict
        # or Item objects.
        pass

    def process_start_requests(start_requests, spider):
        # Called with the start requests of the spider, and works
        # similarly to the process_spider_output() method, except
        # that it doesn't have a response associated.

        # Must return only requests (not items).
        for r in start_requests:
            yield r

    def spider_opened(self, spider):
        spider.logger.info('Spider opened: %s' % spider.name)
scraping-alpha/Scraping_Alpha/Scraping_Alpha/pipelines.py (new file)
@@ -0,0 +1,11 @@
# -*- coding: utf-8 -*-

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html


class ScrapingAlphaPipeline(object):
    def process_item(self, item, spider):
        return item
scraping-alpha/Scraping_Alpha/Scraping_Alpha/settings.py (new file)
@@ -0,0 +1,91 @@
# -*- coding: utf-8 -*-

# Scrapy settings for Scraping_Alpha project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
# http://doc.scrapy.org/en/latest/topics/settings.html
# http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
# http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html

BOT_NAME = 'Scraping_Alpha'

SPIDER_MODULES = ['Scraping_Alpha.spiders']
NEWSPIDER_MODULE = 'Scraping_Alpha.spiders'

USER_AGENT = "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/27.0.1453.93 Safari/537.36"

# Crawl responsibly by identifying yourself (and your website) on the user-agent
#USER_AGENT = 'Scraping_Alpha (+http://www.yourdomain.com)'

# Obey robots.txt rules
ROBOTSTXT_OBEY = True

# Configure maximum concurrent requests performed by Scrapy (default: 16)
#CONCURRENT_REQUESTS = 32

# Configure a delay for requests for the same website (default: 0)
# See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
#DOWNLOAD_DELAY = 3
# The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16

# Disable cookies (enabled by default)
#COOKIES_ENABLED = False

# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False

# Override the default request headers:
#DEFAULT_REQUEST_HEADERS = {
#   'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
#   'Accept-Language': 'en',
#}

# Enable or disable spider middlewares
# See http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html
#SPIDER_MIDDLEWARES = {
#    'Scraping_Alpha.middlewares.ScrapingAlphaSpiderMiddleware': 543,
#}

# Enable or disable downloader middlewares
# See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
#DOWNLOADER_MIDDLEWARES = {
#    'Scraping_Alpha.middlewares.MyCustomDownloaderMiddleware': 543,
#}

# Enable or disable extensions
# See http://scrapy.readthedocs.org/en/latest/topics/extensions.html
#EXTENSIONS = {
#    'scrapy.extensions.telnet.TelnetConsole': None,
#}

# Configure item pipelines
# See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html
#ITEM_PIPELINES = {
#    'Scraping_Alpha.pipelines.SomePipeline': 300,
#}

# Enable and configure the AutoThrottle extension (disabled by default)
# See http://doc.scrapy.org/en/latest/topics/autothrottle.html
#AUTOTHROTTLE_ENABLED = True
# The initial download delay
#AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
#AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server
#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
#AUTOTHROTTLE_DEBUG = False

# Enable and configure HTTP caching (disabled by default)
# See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
#HTTPCACHE_ENABLED = True
#HTTPCACHE_EXPIRATION_SECS = 0
#HTTPCACHE_DIR = 'httpcache'
#HTTPCACHE_IGNORE_HTTP_CODES = []
#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
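If the crawl needs slowing down (the spider further below will otherwise walk every index page), the commented-out settings above are the relevant knobs; a minimal sketch, with illustrative values that are not part of this commit:

# Illustrative values only -- these settings appear commented out above and
# are not enabled in this commit; enabling something like this throttles the crawl.
DOWNLOAD_DELAY = 3
AUTOTHROTTLE_ENABLED = True
AUTOTHROTTLE_START_DELAY = 5
AUTOTHROTTLE_MAX_DELAY = 60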
scraping-alpha/Scraping_Alpha/Scraping_Alpha/settings.pyc (new binary file, not shown)
scraping-alpha/Scraping_Alpha/Scraping_Alpha/spiders/__init__.py (new file)
@@ -0,0 +1,4 @@
# This package will contain the spiders of your Scrapy project
#
# Please refer to the documentation for information on how to create and manage
# your spiders.

Binary file not shown.
scraping-alpha/Scraping_Alpha/Scraping_Alpha/spiders/transcript_spider.py (new file)
@@ -0,0 +1,171 @@
#
# Title: Scraping Alpha
# Version: 1.0
# Author: Ben Goldsworthy <b.goldsworthy@lancaster.ac.uk>
#
# This file is a part of Scraping Alpha, a series of scripts to scrape
# earnings call transcripts from seekingalpha.com and present them as useful
# SQL.
#
# This file is the webspider that Scrapy uses to retrieve the information from
# the website. Left unattended, it will scrape all 4,000+ pages of results.
#
# To interrupt this behaviour and still be able to proceed with the other
# steps, cancel the script with CTRL+Z. This will likely leave an unfinished
# JSON item at the end of the output file. To clear this up, open the file
# in vim and type the following keys:
# 'G', 'V', 'd', '$', 'i', 'BACKSPACE', 'ENTER', ']', 'ESC', ':wq', 'ENTER'
# This will truncate the file at the last complete record and seal it off.
#
# For installation instructions for Scrapy, visit
# <doc.scrapy.org/en/latest/intro/install.html>. This file should be in the
# `spiders` directory of the project, and is run via 'scrapy crawl transcripts
# -o transcripts.json' at the command line (the output file will be placed
# in the directory the Terminal is currently in).
#

# Some of the <Exec, Position> tuples are separated by an em- rather than an
# en-dash, which isn't featured in the ASCII charset, hence the below line:
# -*- coding: utf-8 -*-

import scrapy
# This enum lists the stages of each transcript.
from enum import Enum
Stage = Enum('Stage', 'preamble execs analysts body')
# Some transcript preambles are concatenated on a single line. This list is used
# to separate the title and date sections of the string.
months = ["Jan", "Feb", "Mar", "Apr", "May", "Jun", "Jul", "Aug", "Sep", "Oct", "Nov", "Dec"]
transcripts = {}

class TranscriptSpider(scrapy.Spider):
    name = 'transcripts'
    start_urls = ['http://seekingalpha.com/earnings/earnings-call-transcripts/1']

    def parse(self, response):
        # Follows each transcript page's link from the given index page.
        for href in response.css('.dashboard-article-link::attr(href)').extract():
            yield scrapy.Request(response.urljoin(href), callback=self.parse_transcript)

        # Follows the pagination links at the bottom of the given index page.
        next_page = response.css('li.next a::attr(href)').extract_first()
        if next_page is not None:
            next_page = response.urljoin(next_page)
            yield scrapy.Request(next_page, callback=self.parse)

    def parse_transcript(self, response):
        i = 0
        transcript = {}
        details = {}
        execs = []
        analysts = []
        script = []
        mode = 1

        # As the pages are represented by a series of `<p>` elements, all with
        # the same class `.p1` and no unique identifiers, we have to do this the
        # old-fashioned way - breaking it into chunks and iterating over them.
        body = response.css('div#a-body p.p1')
        chunks = body.css('p.p1')
        while i < len(chunks):
            # Headings mark the start of each new section. If the current line
            # is not a heading, or we're already in the transcript body (where
            # headings just denote speakers), handle it according to the current
            # section flag; otherwise, advance to the next section (see the
            # `else` branch below).
            if (len(chunks[i].css('strong::text').extract()) == 0) or (mode == 4):
                currStage = Stage(mode)
                # If we're on the preamble stage, each bit of data is extracted
                # separately as they all have their own key in the JSON.
                if currStage == Stage['preamble']:
                    # If we're on the first line of the preamble, that's the
                    # company name, stock exchange and ticker acronym (or should
                    # be - see below)
                    if i == 0:
                        # Checks to see if the second line is a heading. If not,
                        # everything is fine.
                        if len(chunks[1].css('strong::text').extract()) == 0:
                            details['company'] = chunks[i].css('p::text').extract_first()
                            if " (" in details['company']:
                                details['company'] = details['company'].split(' (')[0]
                            # If a specific stock exchange is not listed, it
                            # defaults to NYSE
                            details['exchange'] = "NYSE"
                            details['ticker'] = chunks.css('a::text').extract_first()
                            if ":" in details['ticker']:
                                ticker = details['ticker'].split(':')
                                details['exchange'] = ticker[0]
                                details['ticker'] = ticker[1]
                        # However, if it is, that means this line contains the
                        # full, concatenated preamble, so everything must be
                        # extracted here
                        else:
                            details['company'] = chunks[i].css('p::text').extract_first()
                            if " (" in details['company']:
                                details['company'] = details['company'].split(' (')[0]
                            # If a specific stock exchange is not listed, default to NYSE
                            details['exchange'] = "NYSE"
                            details['ticker'] = chunks.css('a::text').extract_first()
                            if ":" in details['ticker']:
                                ticker = details['ticker'].split(':')
                                details['exchange'] = ticker[0]
                                details['ticker'] = ticker[1]
                            titleAndDate = chunks[i].css('p::text').extract()[1]
                            for date in months:
                                if date in titleAndDate:
                                    splits = titleAndDate.split(date)
                                    details['title'] = splits[0]
                                    details['date'] = date + splits[1]
                    # Otherwise, we're onto the title line.
                    elif i == 1:
                        title = chunks[i].css('p::text').extract_first()
                        # This should never be the case, but just to be careful
                        # I'm leaving it in.
                        if len(title) <= 0:
                            title = "NO TITLE"
                        details['title'] = title
                    # Or the date line.
                    elif i == 2:
                        details['date'] = chunks[i].css('p::text').extract_first()
                # If we're onto the 'Executives' section, we create a list of
                # all of their names, positions and company name (from the
                # preamble).
                elif currStage == Stage['execs']:
                    anExec = chunks[i].css('p::text').extract_first().split(" - ")
                    # This covers if the execs are separated with an em- rather
                    # than an en-dash (see above).
                    if len(anExec) <= 1:
                        anExec = chunks[i].css('p::text').extract_first().split(" – ")
                    name = anExec[0]
                    if len(anExec) > 1:
                        position = anExec[1]
                    # Again, this should never be the case, as an Exec-less
                    # company would find it hard to get much done.
                    else:
                        position = ""
                    execs.append((name,position,details['company']))
                # This does the same, but with the analysts (which never seem
                # to be separated by em-dashes for some reason).
                elif currStage == Stage['analysts']:
                    name = chunks[i].css('p::text').extract_first().split(" - ")[0]
                    company = chunks[i].css('p::text').extract_first().split(" - ")[1]
                    analysts.append((name,company))
                # This strips the transcript body of everything except simple
                # HTML, and stores that.
                elif currStage == Stage['body']:
                    line = chunks[i].css('p::text').extract_first()
                    html = "p>"
                    if line is None:
                        line = chunks[i].css('strong::text').extract_first()
                        html = "h1>"
                    script.append("<"+html+line+"</"+html)
            else:
                mode += 1
            i += 1

        # Adds the various arrays to the dictionary for the transcript
        details['exec'] = execs
        details['analysts'] = analysts
        details['transcript'] = ''.join(script)

        # Adds this transcript to the dictionary of all scraped
        # transcripts, and yields that for the output
        transcript["entry"] = details
        yield transcript
Binary file not shown.
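As a standalone illustration of the dash-splitting used for the 'Executives' lines in `transcript_spider.py` above, the logic can be exercised in isolation; the helper and the names below are hypothetical and not part of the project:

# -*- coding: utf-8 -*-
# Hypothetical helper mirroring the exec-line parsing in the spider above.
def split_speaker(line):
    parts = line.split(" - ")
    if len(parts) <= 1:
        # Fall back to the other dash variant, as the spider does.
        parts = line.split(" – ")
    name = parts[0]
    position = parts[1] if len(parts) > 1 else ""
    return name, position

print split_speaker("Jane Doe - Chief Executive Officer")
# ('Jane Doe', 'Chief Executive Officer')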
scraping-alpha/Scraping_Alpha/execsAndAnalysts.py (new file)
@@ -0,0 +1,104 @@
#
# Title: Scraping Alpha
# Version: 1.0
# Author: Ben Goldsworthy <b.goldsworthy@lancaster.ac.uk>
#
# This file is a part of Scraping Alpha, a series of scripts to scrape
# earnings call transcripts from seekingalpha.com and present them as useful
# SQL.
#
# This file takes the `transcripts.sql` file exported from the database
# created using the output file of `JSONtoSQL.py` after the following query:
#   SELECT `id`, `execs`, `analysts` FROM `transcripts`
# From this it creates two files, `execs.sql` and `analysts.sql`, for creating
# linking tables in the database.
#
# This file should be located in the same directory as `transcripts.sql`, and
# is run via 'python execsAndAnalysts.py'.
#

import sys
import codecs
import os
from shutil import copyfile
import fileinput

sys.stdout = codecs.getwriter('utf-8')(sys.stdout)

start = 0

# Creates a temporary copy in case something goes Pete Tong.
copyfile("transcripts.sql", "transcripts.sql.tmp")

# Trims everything from the export except for the INSERT statement.
for line in fileinput.FileInput("transcripts.sql.tmp", inplace=1):
    if start == 0:
        if "INSERT INTO `transcripts`" in line:
            start = 1
            print "INSERT INTO `execs_to_transcripts` (`exec_id`, `transcript_id`) VALUES"
    else:
        if line == "\n":
            start = 0
        else:
            print "\t"+line,

# Copies the produced file to create both the output files, then deletes the
# temporary file.
copyfile("transcripts.sql.tmp", "execs.sql")
copyfile("transcripts.sql.tmp", "analysts.sql")
os.remove("transcripts.sql.tmp")

# Converts each line "(x, '0;...;n')" in the file to n separate INSERTs, one
# for each executive.
start = 0
nL = ""
for line in fileinput.FileInput("execs.sql", inplace=1):
    if start == 0:
        start = 1
    else:
        bits = line.split(', ')
        tID = bits[0].strip('\t').strip('(')
        execs = bits[1].split(';')
        newLines = ""
        for execID in execs:
            newLines = newLines + nL + "\t("+tID+", "+execID.strip('\'')+"),"
        line = line.replace(line, newLines)
        nL = "\n"
    print line,

# Does the same for the analysts.
start = 0
nL = ""
for line in fileinput.FileInput("analysts.sql", inplace=1):
    if start == 0:
        start = 1
        line = line.replace(line, "INSERT INTO `analysts_to_transcripts` (`analyst_id`, `transcript_id`) VALUES\n")
    else:
        bits = line.split(', ')
        tID = bits[0].strip('\t').strip('(')
        # As it is possible for there to be no analysts in a call, this ignores
        # blank results.
        if "''" not in bits[2]:
            analysts = bits[2].split(';')
            newLines = ""
            for analystID in analysts:
                # This stops the final transcript from getting an additional,
                # `analyst_id`-less INSERT
                if analystID != '\n':
                    newLines = newLines + nL + "\t("+tID+", "+analystID.strip('\'').strip('\'),\n')+"),"
            line = line.replace(line, newLines)
            nL = "\n"
        else:
            line = ""
    print line,

# Replace the final comma at the end of each file with a semicolon, to make it
# valid SQL
with open("execs.sql", 'rb+') as filehandle:
    filehandle.seek(-1, os.SEEK_END)
    filehandle.truncate()
    filehandle.write(";")
with open("analysts.sql", 'rb+') as filehandle:
    filehandle.seek(-1, os.SEEK_END)
    filehandle.truncate()
    filehandle.write(";")
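As an illustration of the expansion the script above performs, here is a hypothetical stand-alone version of the line-splitting step (the real script edits the files in place); the input row is in the style of the `transcripts.sql` dump further below:

# Hypothetical helper mirroring the expansion loop in execsAndAnalysts.py above.
def expand_row(line):
    bits = line.split(', ')
    tid = bits[0].strip('\t').strip('(')
    ids = [x.strip("'") for x in bits[1].split(';')]
    return [(tid, i) for i in ids]

print expand_row("\t(2, '1;2;3;4', '1'),")
# [('2', '1'), ('2', '2'), ('2', '3'), ('2', '4')]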
scraping-alpha/Scraping_Alpha/scrapy.cfg (new file)
@@ -0,0 +1,11 @@
# Automatically created by: scrapy startproject
#
# For more information about the [deploy] section see:
# https://scrapyd.readthedocs.org/en/latest/deploy.html

[settings]
default = Scraping_Alpha.settings

[deploy]
#url = http://localhost:6800/
project = Scraping_Alpha
scraping-alpha/Scraping_Alpha/transcripts.json (new file, 63 lines)
File diff suppressed because one or more lines are too long
scraping-alpha/Scraping_Alpha/transcripts.sql (new file)
@@ -0,0 +1,110 @@
-- phpMyAdmin SQL Dump
-- version 4.0.10.14
-- http://www.phpmyadmin.net
--
-- Host: localhost:3306
-- Generation Time: Dec 26, 2016 at 02:39 PM
-- Server version: 5.5.52-cll-lve
-- PHP Version: 5.6.20

SET SQL_MODE = "NO_AUTO_VALUE_ON_ZERO";
SET time_zone = "+00:00";


/*!40101 SET @OLD_CHARACTER_SET_CLIENT=@@CHARACTER_SET_CLIENT */;
/*!40101 SET @OLD_CHARACTER_SET_RESULTS=@@CHARACTER_SET_RESULTS */;
/*!40101 SET @OLD_COLLATION_CONNECTION=@@COLLATION_CONNECTION */;
/*!40101 SET NAMES utf8 */;

--
-- Database: `bengoldsworthy`
--

-- --------------------------------------------------------

--
-- Table structure for table `transcripts`
--

CREATE TABLE IF NOT EXISTS `transcripts` (
  `id` int(11) NOT NULL AUTO_INCREMENT,
  `title` varchar(255) DEFAULT NULL,
  `company` varchar(255) DEFAULT NULL,
  `execs` varchar(255) DEFAULT NULL,
  `analysts` varchar(255) DEFAULT NULL,
  `transcript` text,
  PRIMARY KEY (`id`),
  UNIQUE KEY `id` (`id`)
) ENGINE=MyISAM DEFAULT CHARSET=latin1 AUTO_INCREMENT=63 ;

--
-- Dumping data for table `transcripts`
--

INSERT INTO `transcripts` (`id`, `execs`, `analysts`) VALUES
(1, '0', '0'),
(2, '1;2;3;4', '1'),
(3, '5;6;7', '2;3;4;5;6;7;8;9;10'),
(4, '8;9;10', '11;12;13;14;15;16;17;18;19;20;21;22;23;24;25'),
(5, '11;12;13;14', '26;27;28'),
(6, '15;16;17;18;19', '29;30;31;32;33;34;35;36;37;38;39'),
(7, '20;21;22;23', '40;41;42;43;44;45;46;47;48'),
(8, '24;25;26', '49;50;51;52;53;54;55;56;57;58;59;60;61;62;63;64;65;66;67;68'),
(9, '27;28;29;30', '69;70;71'),
(10, '31;32', '72;73;74;75;76;77;78;79;80;81'),
(11, '33;34;35', '82;83;84;85;86;87;88;89;90;91;92;93;94;95'),
(12, '36;37;38;39', '96;97;98;81;99'),
(13, '40;41;42', '100;101;102;103;104;105;106'),
(14, '43;44;45;46;47;48;49;50;51', '107;108;109;110;111;112;113;114;115;116;117'),
(15, '52;53;54', '13;118;119;120;23;121'),
(16, '55;56;57', '122;39;123;124;125;126'),
(17, '58;59;60;61', '127;128;129;130;131'),
(18, '62;63', '132;133'),
(19, '64;65;66', '134;135;136;137;138;139;140;141;142'),
(20, '67;68', '141;143;144;145;146;142;147;148;149;95;150;151;152;153'),
(21, '69;70;71', '154;155;156;157;158;159;160;161;162;163;164;165;166;167;168;169;170'),
(22, '72;73;74', '171;172;173;174'),
(23, '75;76;77;78', '175;176;177;178;179;180;181'),
(24, '79;80;81', '182;183'),
(25, '82;83;84', '184;185;186;187;188;189;190;191;192;193;194;195;196;197;198'),
(26, '85;86;87;88', '199;200;201;202;203;204;205;206;207;208;209;210;211;212;213'),
(27, '89;90;91;92;93', '214;215;216;44;217;218;41;219;45;220'),
(28, '94;95;96', '71;70'),
(29, '97;98;99', ''),
(30, '100;101', '221;222;131'),
(31, '102;103;104;105', '223;224;225'),
(32, '106;107', '226;85;227;228;229;230;231;232;112;233;234;235'),
(33, '108;109', '236;237;238'),
(34, '110;111', '239;240;241;242;243;244'),
(35, '112;113;114', '245;246'),
(36, '115;116;117;118', '247;248'),
(37, '119;120;121', '249;250'),
(38, '122;123', '251;252;253'),
(39, '124;125;126', '254;255;256;257;258;259;260;261;262'),
(40, '127;128;129', '263;264;265'),
(41, '130;131;132;133', '266;267'),
(42, '134;135;136;137;138;139', '268;269;270;271;272'),
(43, '140;141;142', '190;273;274;275;276;277;278;279;280;186'),
(44, '143;144;145', '281;164;282;154;283;284;285;286;287;288;289;290;291'),
(45, '146;147;148', '292;293;294'),
(46, '149;150;151', '295'),
(47, '152;153;154;155', '165;296;166;297;298;299;160;300'),
(48, '156;157;158', '301;302;303;304;305;306;307;308'),
(49, '159;160;161', '309;310;311'),
(50, '162;163', '312;313'),
(51, '164;165', '314'),
(52, '166;167;168', '315;316;317;318'),
(53, '169;170;171', '319;320'),
(54, '172;173;174;175;176;177', '321;322;323;324;325;326;327;328'),
(55, '178;179;180', '329;330;331;332;333'),
(56, '181;182', '334;335'),
(57, '183;184;185', '336;337;338;339;340;341;342;343'),
(58, '186;187;188;189;190;191;192;193;194;195', '344;345;238;346;347;348;349;350;351;352;353;237;354'),
(59, '196;197;198', '355;356;357;358;359'),
(60, '199;200;201', '360;361'),
(61, '202;203', '362;363'),
(62, '204;205;206', '364;365');

/*!40101 SET CHARACTER_SET_CLIENT=@OLD_CHARACTER_SET_CLIENT */;
/*!40101 SET CHARACTER_SET_RESULTS=@OLD_CHARACTER_SET_RESULTS */;
/*!40101 SET COLLATION_CONNECTION=@OLD_COLLATION_CONNECTION */;
(deleted file, 73 lines: an earlier version of the transcript spider; path not shown in this view)
@@ -1,73 +0,0 @@
import scrapy
import re

class TranscriptSpider(scrapy.Spider):
    name = 'transcripts'

    start_urls = ['http://seekingalpha.com/earnings/earnings-call-transcripts/1']
    def cleanhtml(raw_html):
        cleanr = re.compile('<.*?>')
        cleantext = re.sub(cleanr, '', raw_html)
        return cleantext

    def parse(self, response):
        # follow links to transcript pages
        for href in response.css('.dashboard-article-link::attr(href)').extract():
            yield scrapy.Request(response.urljoin(href),
                                 callback=self.parse_transcript)

        # follow pagination links
        #next_page = response.css('li.next a::attr(href)').extract_first()
        #if next_page is not None:
        #    next_page = response.urljoin(next_page)
        #    yield scrapy.Request(next_page, callback=self.parse)

    def parse_transcript(self, response):
        i = 4
        def extract_with_css(query):
            return response.css(query).extract_first().strip()

        body = response.css('div#a-body p.p1')
        chunks = body.css('p.p1')
        firstline = chunks[0].css('p::text').extract()
        ticker = chunks.css('a::text').extract_first()
        if ":" in ticker:
            ticker = ticker.split(':')[1]

        name = re.compile('([A-z -]* - [A-z ,&-]*)')
        execs = []
        analysts = []

        nextLine = chunks[i].css('p::text').extract_first()
        while re.match(name, nextLine) is not None:
            execs.append(nextLine)
            i += i
            nextLine = chunks[i].css('p::text').extract_first()
        print "DONE EXECS"
        print i
        print "Next line: "+nextLine
        while re.match(name, nextLine) is not None:
            analysts.append(nextLine)
            i += i
            nextLine = chunks[i].css('p::text').extract_first()
        print "DONE ANALYSTS"

        print execs
        print "-----------"
        print analysts
        print "^^^^^^^^^"

        #### PLACEHOLDER
        i = 0
        while True:
            print i ,": " , chunks[i].css('p::text').extract_first()
            print i ,": " , chunks[i].css('strong::text').extract_first()
            i += 1

        #yield {
        #    'company': firstline[0].split(" (", 1)[0],
        #    'stockmarket': firstline[0].split(" (", 1)[1],
        #    'ticker': ticker,
        #    'title': chunks[1].css('p::text').extract_first(),
        #    'date': chunks[2].css('p::text').extract_first()
        #}