Just Scraping Alpha

This commit is contained in:
Rumps 2017-01-13 17:56:41 +00:00
parent 75e3dc5790
commit b6c1660f04
24 changed files with 8 additions and 1000 deletions

126
Scraping_Alpha/JSONtoSQL.py Normal file

@@ -0,0 +1,126 @@
#
# Title: Scraping Alpha
# Version: 1.0
# Author: Ben Goldsworthy <b.goldsworthy@lancaster.ac.uk>
#
# This file is a part of Scraping Alpha, a series of scripts to scrape
# earnings call transcripts from seekingalpha.com and present them as useful
# SQL.
#
# This file takes the `transcripts.json` file output of `transcript_spider.py`
# and converts it into SQL.
#
# This file should be located in the same directory as `transcripts.json`, and
# is run via 'python JSONtoSQL.py > [FILE].sql', where '[FILE]' is the name of
# the output file.
#
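# For reference, each record in `transcripts.json` is assumed to take roughly
# the following shape (the field names are those read by the code below; the
# example values are hypothetical):
#
#   {"entry": {"title":      "Acme Corp Q4 2016 Earnings Call",
#              "company":    "Acme Corp",
#              "exec":       [["Jane Doe", "CEO", "Acme Corp"]],
#              "analysts":   [["John Smith", "Example Securities"]],
#              "transcript": "<p>...</p>"}}
#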
import json
import sys
import codecs
sys.stdout=codecs.getwriter('utf-8')(sys.stdout)
json_data=open('transcripts.json').read()
data = json.loads(json_data)
executives = []
analysts = []
# For each transcript, creates new, separate arrays of executives and analysts
# for their own database tables, replacing their tuples in the transcript with
# their database keys.
for entry in data:
    indexExec = len(executives)+1
    indexAnal = len(analysts)+1

    newExecs = []
    for executive in entry['entry']['exec']:
        if executive not in executives:
            executives.append(executive)
            newExecs.append(indexExec)
            indexExec += 1
        else:
            newExecs.append(executives.index(executive) + 1)
    entry['entry']['exec'] = newExecs

    newAnals = []
    for analyst in entry['entry']['analysts']:
        if analyst not in analysts:
            analysts.append(analyst)
            newAnals.append(indexAnal)
            indexAnal += 1
        else:
            newAnals.append(analysts.index(analyst) + 1)
    entry['entry']['analysts'] = newAnals
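# A worked example of the mapping above (names hypothetical): if transcript 1
# lists the executive ("Jane Doe", "CEO", "Acme Corp") and transcript 2 lists
# the same tuple, the first pass appends it to `executives` (key 1) and both
# transcripts' 'exec' lists end up holding the key 1 rather than the tuple.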
# Outputs the SQL file that creates the various tables and populates them with
# INSERT statements.
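# As a rough sketch (the values are placeholders), the generated file begins:
#
#   CREATE TABLE IF NOT EXISTS `execs`
#   (
#       `id` INT NOT NULL UNIQUE AUTO_INCREMENT,
#       ...
#   );
#
#   INSERT INTO `execs` (`name`, `position`, `company`) VALUES
#       (0,0,0) ,
#       ("Jane Doe","CEO","Acme Corp") ;
#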
print "CREATE TABLE IF NOT EXISTS `execs`"
print "("
print "\t`id` INT NOT NULL UNIQUE AUTO_INCREMENT,"
print "\t`name` VARCHAR(255),"
print "\t`position` VARCHAR(255),"
print "\t`company` VARCHAR(255),"
print "\tPRIMARY KEY(`id`)"
print ");\n"
print "INSERT INTO `execs` (`name`, `position`, `company`) VALUES"
print "\t(0,0,0)",
for executive in executives:
    print ","
    print "\t(\""+executive[0]+"\",\""+executive[1]+"\",\""+executive[2]+"\")",
print ";\n"
print "CREATE TABLE IF NOT EXISTS `analysts`"
print "("
print "\t`id` INT NOT NULL UNIQUE AUTO_INCREMENT,"
print "\t`name` VARCHAR(255),"
print "\t`company` VARCHAR(255),"
print "\tPRIMARY KEY(`id`)"
print ");\n"
print "INSERT INTO `analysts` (`name`, `company`) VALUES"
print "\t(0,0)",
for analyst in analysts:
    print ","
    print "\t(\""+analyst[0]+"\",\""+analyst[1]+"\")",
print ";\n"
print "CREATE TABLE IF NOT EXISTS `transcripts`"
print "("
print "\t`id` INT NOT NULL UNIQUE AUTO_INCREMENT,"
print "\t`title` VARCHAR(255),"
print "\t`company` VARCHAR(255),"
print "\t`execs` VARCHAR(255),"
print "\t`analysts` VARCHAR(255),"
print "\t`transcript` TEXT,"
print "\tPRIMARY KEY(`id`)"
print ");\n"
print "INSERT INTO `transcripts` (`title`, `company`, `execs`, `analysts`, `transcript`) VALUES"
print "\t(0,0,0,0,0)",
for entry in data:
    tran = entry['entry']
    print ","
    print "\t(\""+tran['title']+"\",\""+tran['company']+"\",\""+(';'.join(str(x) for x in tran['exec']))+"\",\""+(';'.join(str(x) for x in tran['analysts']))+"\",\""+tran['transcript']+"\")",
print ";\n"
print "CREATE TABLE IF NOT EXISTS `execs_to_transcripts`"
print "("
print "\t`exec_id` INT NOT NULL,"
print "\t`transcript_id` INT NOT NULL,"
print "\tPRIMARY KEY(`exec_id`, `transcript_id`),"
print "\tFOREIGN KEY (`exec_id`) REFERENCES `execs`(`id`),"
print "\tFOREIGN KEY (`transcript_id`) REFERENCES `transcripts`(`id`)"
print ");\n"
print "CREATE TABLE IF NOT EXISTS `analysts_to_transcripts`"
print "("
print "\t`analyst_id` INT NOT NULL,"
print "\t`transcript_id` INT NOT NULL,"
print "\tPRIMARY KEY(`analyst_id`, `transcript_id`),"
print "\tFOREIGN KEY (`analyst_id`) REFERENCES `analysts`(`id`),"
print "\tFOREIGN KEY (`transcript_id`) REFERENCES `transcripts`(`id`)"
print ");"

Binary file not shown.


@@ -0,0 +1,14 @@
# -*- coding: utf-8 -*-
# Define here the models for your scraped items
#
# See documentation in:
# http://doc.scrapy.org/en/latest/topics/items.html
import scrapy
class ScrapingAlphaItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    pass


@@ -0,0 +1,56 @@
# -*- coding: utf-8 -*-
# Define here the models for your spider middleware
#
# See documentation in:
# http://doc.scrapy.org/en/latest/topics/spider-middleware.html
from scrapy import signals
class ScrapingAlphaSpiderMiddleware(object):
    # Not all methods need to be defined. If a method is not defined,
    # scrapy acts as if the spider middleware does not modify the
    # passed objects.

    @classmethod
    def from_crawler(cls, crawler):
        # This method is used by Scrapy to create your spiders.
        s = cls()
        crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
        return s

    def process_spider_input(self, response, spider):
        # Called for each response that goes through the spider
        # middleware and into the spider.
        # Should return None or raise an exception.
        return None

    def process_spider_output(self, response, result, spider):
        # Called with the results returned from the Spider, after
        # it has processed the response.
        # Must return an iterable of Request, dict or Item objects.
        for i in result:
            yield i

    def process_spider_exception(self, response, exception, spider):
        # Called when a spider or process_spider_input() method
        # (from other spider middleware) raises an exception.
        # Should return either None or an iterable of Response, dict
        # or Item objects.
        pass

    def process_start_requests(self, start_requests, spider):
        # Called with the start requests of the spider, and works
        # similarly to the process_spider_output() method, except
        # that it doesn't have a response associated.
        # Must return only requests (not items).
        for r in start_requests:
            yield r

    def spider_opened(self, spider):
        spider.logger.info('Spider opened: %s' % spider.name)


@@ -0,0 +1,11 @@
# -*- coding: utf-8 -*-
# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html
class ScrapingAlphaPipeline(object):
    def process_item(self, item, spider):
        return item


@@ -0,0 +1,91 @@
# -*- coding: utf-8 -*-
# Scrapy settings for Scraping_Alpha project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
# http://doc.scrapy.org/en/latest/topics/settings.html
# http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
# http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html
BOT_NAME = 'Scraping_Alpha'
SPIDER_MODULES = ['Scraping_Alpha.spiders']
NEWSPIDER_MODULE = 'Scraping_Alpha.spiders'
USER_AGENT = "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/27.0.1453.93 Safari/537.36"
# Crawl responsibly by identifying yourself (and your website) on the user-agent
#USER_AGENT = 'Scraping_Alpha (+http://www.yourdomain.com)'
# Obey robots.txt rules
ROBOTSTXT_OBEY = True
# Configure maximum concurrent requests performed by Scrapy (default: 16)
#CONCURRENT_REQUESTS = 32
# Configure a delay for requests for the same website (default: 0)
# See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
#DOWNLOAD_DELAY = 3
# The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16
# Disable cookies (enabled by default)
#COOKIES_ENABLED = False
# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False
# Override the default request headers:
#DEFAULT_REQUEST_HEADERS = {
# 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
# 'Accept-Language': 'en',
#}
# Enable or disable spider middlewares
# See http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html
#SPIDER_MIDDLEWARES = {
# 'Scraping_Alpha.middlewares.ScrapingAlphaSpiderMiddleware': 543,
#}
# Enable or disable downloader middlewares
# See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
#DOWNLOADER_MIDDLEWARES = {
# 'Scraping_Alpha.middlewares.MyCustomDownloaderMiddleware': 543,
#}
# Enable or disable extensions
# See http://scrapy.readthedocs.org/en/latest/topics/extensions.html
#EXTENSIONS = {
# 'scrapy.extensions.telnet.TelnetConsole': None,
#}
# Configure item pipelines
# See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html
#ITEM_PIPELINES = {
# 'Scraping_Alpha.pipelines.SomePipeline': 300,
#}
# Enable and configure the AutoThrottle extension (disabled by default)
# See http://doc.scrapy.org/en/latest/topics/autothrottle.html
#AUTOTHROTTLE_ENABLED = True
# The initial download delay
#AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
#AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server
#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
#AUTOTHROTTLE_DEBUG = False
# Enable and configure HTTP caching (disabled by default)
# See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
#HTTPCACHE_ENABLED = True
#HTTPCACHE_EXPIRATION_SECS = 0
#HTTPCACHE_DIR = 'httpcache'
#HTTPCACHE_IGNORE_HTTP_CODES = []
#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'

Binary file not shown.


@@ -0,0 +1,4 @@
# This package will contain the spiders of your Scrapy project
#
# Please refer to the documentation for information on how to create and manage
# your spiders.

Binary file not shown.


@@ -0,0 +1,79 @@
#
# Title: Scraping Alpha
# Version: 1.0
# Author: Ben Goldsworthy <b.goldsworthy@lancaster.ac.uk>
#
# This file is a part of Scraping Alpha, a series of scripts to scrape
# earnings call transcripts from seekingalpha.com and present them as useful
# SQL.
#
# This file is the webspider that Scrapy uses to retrieve slides.
#
import scrapy
urls = []
# A transcript record can be uniquely identified using its company name + date.
uniqueID = ""
# Some transcript preambles are concatenated on a single line. This list is used
# to separate the title and date sections of the string.
months = ["Jan", "Feb", "Mar", "Apr", "May", "Jun", "Jul", "Aug", "Sep", "Oct", "Nov", "Dec"]
class SlidesSpider(scrapy.Spider):
    name = 'slides'
    start_urls = ['http://seekingalpha.com/earnings/earnings-call-transcripts/1']

    def parse(self, response):
        # Follows each transcript page's link from the given index page.
        for href in response.css('.dashboard-article-link::attr(href)').extract():
            yield scrapy.Request(response.urljoin(href), callback=self.parse_transcript)

        # Follows the pagination links at the bottom of the given index page.
        next_page = response.css('li.next a::attr(href)').extract_first()
        if next_page is not None:
            next_page = response.urljoin(next_page)
            yield scrapy.Request(next_page, callback=self.parse)

    def parse_transcript(self, response):
        slides = response.css('li#slides a::attr(href)').extract_first()
        if slides is not None:
            body = response.css('div#a-body p.p1')
            chunks = body.css('p.p1')
            i = 0
            while i < 3:
                # If we're on the first line of the preamble, that's the
                # company name, stock exchange and ticker acronym (or should
                # be - see below)
                if i == 0:
                    # Checks to see if the second line is a heading. If not,
                    # everything is fine.
                    if len(chunks[1].css('strong::text').extract()) == 0:
                        uniqueID = chunks[i].css('p::text').extract_first()
                        if " (" in uniqueID:
                            uniqueID = uniqueID.split(' (')[0]
                        i = 2
                    # However, if it is, that means this line contains the
                    # full, concatenated preamble, so everything must be
                    # extracted here
                    else:
                        uniqueID = chunks[i].css('p::text').extract_first()
                        if " (" in uniqueID:
                            uniqueID = uniqueID.split(' (')[0]
                        titleAndDate = chunks[i].css('p::text').extract()[1]
                        for date in months:
                            if date in titleAndDate:
                                splits = titleAndDate.split(date)
                                uniqueID = uniqueID + ";" + date + splits[1]
                                i = 3
                # Otherwise, we're onto the date line.
                elif i == 2:
                    uniqueID = uniqueID + ";" + chunks[i].css('p::text').extract_first()
                i += 1

            slides = response.urljoin(slides)
            yield uniqueID
            #yield scrapy.Request(slides, callback=self.parse_slides)

    def parse_slides(self, response):
        urls = response.css('figure img::attr(src)').extract()
        yield uniqueID + "\\\\" + ';'.join(urls)


@@ -0,0 +1,168 @@
#-*- coding: utf-8 -*-
# Title: Scraping Alpha
# Version: 1.0
# Author: Ben Goldsworthy <b.goldsworthy@lancaster.ac.uk>
#
# This file is a part of Scraping Alpha, a series of scripts to scrape
# earnings call transcripts from seekingalpha.com and present them as useful
# SQL.
#
# This file is the webspider that Scrapy uses to retrieve the information from
# the website. Left unattended, it will scrape all 4,000+ pages of results.
#
# To interrupt this behaviour and still be able to proceed with the other
# steps, cancel the script with CTRL+Z. This will likely leave an unfinished
# JSON item at the end of the output file. To clear this up, open the file
# in vim and type the following keys:
# 'G', 'V', 'd', '$', 'i', 'BACKSPACE', 'ENTER', ']', 'ESC', ':wq', 'ENTER'
# This will truncate the file at the last complete record and seal it off.
#
# For installation instructions for Scrapy, visit
# <doc.scrapy.org/en/latest/intro/install.html>. This file should be in the
# `spiders` directory of the project, and is run via 'scrapy crawl transcripts
# -o transcripts.json' at the command line (the output file will be placed
# in the directory the Terminal is currently in).
#
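# Each transcript is yielded as a dict of the following rough shape (the keys
# are those set in parse_transcript() below; the values here are hypothetical):
#
#   {"entry": {"company": "Acme Corp", "exchange": "NYSE", "ticker": "ACME",
#              "title": "Acme Corp Q4 2016 Earnings Call",
#              "date": "January 12, 2017",
#              "exec": [("Jane Doe", "CEO", "Acme Corp")],
#              "analysts": [("John Smith", "Example Securities")],
#              "transcript": "<p>...</p><h1>Operator</h1><p>...</p>"}}
#
# `scrapy crawl transcripts -o transcripts.json` serialises these into the
# `transcripts.json` file that `JSONtoSQL.py` consumes.
#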
import scrapy
# This enum lists the stages of each transcript.
from enum import Enum
Stage = Enum('Stage', 'preamble execs analysts body')
# Some transcript preambles are concatenated on a single line. This list is used
# to separate the title and date sections of the string.
months = ["Jan", "Feb", "Mar", "Apr", "May", "Jun", "Jul", "Aug", "Sep", "Oct", "Nov", "Dec"]
transcripts = {}
class TranscriptSpider(scrapy.Spider):
    name = 'transcripts'
    start_urls = ['http://seekingalpha.com/earnings/earnings-call-transcripts/1']

    def parse(self, response):
        # Follows each transcript page's link from the given index page.
        for href in response.css('.dashboard-article-link::attr(href)').extract():
            yield scrapy.Request(response.urljoin(href), callback=self.parse_transcript)

        # Follows the pagination links at the bottom of the given index page.
        next_page = response.css('li.next a::attr(href)').extract_first()
        if next_page is not None:
            next_page = response.urljoin(next_page)
            yield scrapy.Request(next_page, callback=self.parse)

    def parse_transcript(self, response):
        i = 0
        transcript = {}
        details = {}
        execs = []
        analysts = []
        script = []
        mode = 1

        # As the pages are represented by a series of `<p>` elements, all with
        # the same class `.p1` and no unique identifiers, we have to do this the
        # old-fashioned way - breaking it into chunks and iterating over them.
        body = response.css('div#a-body p.p1')
        chunks = body.css('p.p1')
        while i < len(chunks):
            # If the current line is a heading and we're not currently going
            # through the transcript body (where headings represent speakers),
            # change the current section flag to the next section.
            if (len(chunks[i].css('strong::text').extract()) == 0) or (mode == 4):
                currStage = Stage(mode)
                # If we're on the preamble stage, each bit of data is extracted
                # separately as they all have their own key in the JSON.
                if currStage == Stage['preamble']:
                    # If we're on the first line of the preamble, that's the
                    # company name, stock exchange and ticker acronym (or should
                    # be - see below)
                    if i == 0:
                        # Checks to see if the second line is a heading. If not,
                        # everything is fine.
                        if len(chunks[1].css('strong::text').extract()) == 0:
                            details['company'] = chunks[i].css('p::text').extract_first()
                            if " (" in details['company']:
                                details['company'] = details['company'].split(' (')[0]
                            # If a specific stock exchange is not listed, it
                            # defaults to NYSE
                            details['exchange'] = "NYSE"
                            details['ticker'] = chunks.css('a::text').extract_first()
                            if ":" in details['ticker']:
                                ticker = details['ticker'].split(':')
                                details['exchange'] = ticker[0]
                                details['ticker'] = ticker[1]
                        # However, if it is, that means this line contains the
                        # full, concatenated preamble, so everything must be
                        # extracted here
                        else:
                            details['company'] = chunks[i].css('p::text').extract_first()
                            if " (" in details['company']:
                                details['company'] = details['company'].split(' (')[0]
                            # If a specific stock exchange is not listed, default to NYSE
                            details['exchange'] = "NYSE"
                            details['ticker'] = chunks.css('a::text').extract_first()
                            if ":" in details['ticker']:
                                ticker = details['ticker'].split(':')
                                details['exchange'] = ticker[0]
                                details['ticker'] = ticker[1]
                            titleAndDate = chunks[i].css('p::text').extract()[1]
                            for date in months:
                                if date in titleAndDate:
                                    splits = titleAndDate.split(date)
                                    details['title'] = splits[0]
                                    details['date'] = date + splits[1]
                    # Otherwise, we're onto the title line.
                    elif i == 1:
                        title = chunks[i].css('p::text').extract_first()
                        # This should never be the case, but just to be careful
                        # I'm leaving it in.
                        if len(title) <= 0:
                            title = "NO TITLE"
                        details['title'] = title
                    # Or the date line.
                    elif i == 2:
                        details['date'] = chunks[i].css('p::text').extract_first()
                # If we're onto the 'Executives' section, we create a list of
                # all of their names, positions and company name (from the
                # preamble).
                elif currStage == Stage['execs']:
                    anExec = chunks[i].css('p::text').extract_first().split(" - ")
                    # This covers if the execs are separated with an em- rather
                    # than an en-dash (see above).
                    if len(anExec) <= 1:
                        anExec = chunks[i].css('p::text').extract_first().split(" — ")
                    name = anExec[0]
                    if len(anExec) > 1:
                        position = anExec[1]
                    # Again, this should never be the case, as an Exec-less
                    # company would find it hard to get much done.
                    else:
                        position = ""
                    execs.append((name, position, details['company']))
                # This does the same, but with the analysts (which never seem
                # to be separated by em-dashes for some reason).
                elif currStage == Stage['analysts']:
                    name = chunks[i].css('p::text').extract_first().split(" - ")[0]
                    company = chunks[i].css('p::text').extract_first().split(" - ")[1]
                    analysts.append((name, company))
                # This strips the transcript body of everything except simple
                # HTML, and stores that.
                elif currStage == Stage['body']:
                    line = chunks[i].css('p::text').extract_first()
                    html = "p>"
                    if line is None:
                        line = chunks[i].css('strong::text').extract_first()
                        html = "h1>"
                    script.append("<" + html + line + "</" + html)
            else:
                mode += 1
            i += 1

        # Adds the various arrays to the dictionary for the transcript
        details['exec'] = execs
        details['analysts'] = analysts
        details['transcript'] = ''.join(script)

        # Adds this transcript to the dictionary of all scraped
        # transcripts, and yields that for the output
        transcript["entry"] = details
        yield transcript


@@ -0,0 +1,111 @@
#
# Title: Scraping Alpha
# Version: 1.0
# Author: Ben Goldsworthy <b.goldsworthy@lancaster.ac.uk>
#
# This file is a part of Scraping Alpha, a series of scripts to scrape
# earnings call transcripts from seekingalpha.com and present them as useful
# SQL.
#
# This file takes the `transcripts.sql` file exported from the database
# created using the output file of `JSONtoSQL.py` after the following query:
# SELECT `id`, `execs`, `analysts` FROM `transcripts`
# It creates from this two `execs.sql` and `analysts.sql` for creating linking
# tables in the database.
#
# This file should be located in the same directory as `transcripts.sql`, and
# is run via 'python execsAndAnalysts'.
#
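# As a sketch of the transformation this script performs (the row values are
# hypothetical): an exported line such as
#
#   (7, '1;2;3', '4;5'),
#
# becomes, in `execs.sql`,
#
#   (7, 1),
#   (7, 2),
#   (7, 3),
#
# and, in `analysts.sql`,
#
#   (7, 4),
#   (7, 5),
#
# each prefixed with the matching `INSERT INTO ... VALUES` header.
#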
import sys
import codecs
import os
from shutil import copyfile
import fileinput

sys.stdout = codecs.getwriter('utf-8')(sys.stdout)

start = 0

# Creates a temporary copy in case something goes Pete Tong.
copyfile("transcripts.sql", "transcripts.sql.tmp")

# Trims everything from the export except for the INSERT statement.
for line in fileinput.FileInput("transcripts.sql.tmp", inplace=1):
    if start == 0:
        if "INSERT INTO `transcripts`" in line:
            start = 1
            print "INSERT INTO `execs_to_transcripts` (`exec_id`, `transcript_id`) VALUES"
    else:
        if line == "\n":
            start = 0
        else:
            print "\t"+line,

# Copies the produced file to create both the output files, then deletes the
# temporary file.
copyfile("transcripts.sql.tmp", "execs.sql")
copyfile("transcripts.sql.tmp", "analysts.sql")
os.remove("transcripts.sql.tmp")

# Converts each line "(x, '0;...;n')" in the file to n separate INSERTs, one
# for each executive.
start = 0
nL = ""
for line in fileinput.FileInput("execs.sql", inplace=1):
    if start == 0:
        start = 1
    else:
        bits = line.split(', ')
        tID = bits[0].strip('\t').strip('(')
        execs = bits[1].split(';')
        newLines = ""
        for execID in execs:
            newLines = newLines + nL + "\t("+tID+", "+execID.strip('\'')+"),"
        line = line.replace(line, newLines)
        nL = "\n"
    print line,

# Does the same for the analysts.
start = 0
nL = ""
for line in fileinput.FileInput("analysts.sql", inplace=1):
    if start == 0:
        start = 1
        line = line.replace(line, "INSERT INTO `analysts_to_transcripts` (`analyst_id`, `transcript_id`) VALUES\n")
    else:
        bits = line.split(', ')
        tID = bits[0].strip('\t').strip('(')
        # As it is possible for there to be no analysts in a call, this ignores
        # blank results.
        if "''" not in bits[2]:
            analysts = bits[2].split(';')
            newLines = ""
            for analystID in analysts:
                # This stops the final transcript from getting an additional,
                # `analyst_id`-less INSERT
                if analystID != '\n':
                    newLines = newLines + nL + "\t("+tID+", "+analystID.strip('\'').strip('\'),\n')+"),"
            line = line.replace(line, newLines)
            nL = "\n"
        else:
            line = ""
    print line,

# Replaces the final comma at the end of each file with a semicolon, to make it
# valid SQL.
with open("execs.sql", 'rb+') as filehandle:
    filehandle.seek(-1, os.SEEK_END)
    filehandle.truncate()
    filehandle.write(";")

with open("analysts.sql", 'rb+') as filehandle:
    filehandle.seek(-1, os.SEEK_END)
    filehandle.truncate()
    filehandle.write(";")
    # `analysts.sql` then performs some cleanup on the database.
    filehandle.write("\n\nALTER TABLE `transcripts`\n\tDROP COLUMN `execs`,\n\tDROP COLUMN `analysts`;\n\n")
    filehandle.write("DELETE FROM `transcripts` WHERE `id` = 0;\n")
    filehandle.write("DELETE FROM `execs` WHERE `id` = 0;\n")
    filehandle.write("DELETE FROM `analysts` WHERE `id` = 0;\n")
    filehandle.write("DELETE FROM `execs_to_transcripts` WHERE `transcript_id` = 0;\n")
    filehandle.write("DELETE FROM `analysts_to_transcripts` WHERE `transcript_id` = 0;\n")

11
Scraping_Alpha/scrapy.cfg Normal file

@@ -0,0 +1,11 @@
# Automatically created by: scrapy startproject
#
# For more information about the [deploy] section see:
# https://scrapyd.readthedocs.org/en/latest/deploy.html
[settings]
default = Scraping_Alpha.settings
[deploy]
#url = http://localhost:6800/
project = Scraping_Alpha

0
Scraping_Alpha/urls.json Normal file