1.0 fuck yes
parent f0998ecd98
commit 57800c7114
17 changed files with 761 additions and 73 deletions
scraping-alpha/Scraping_Alpha/JSONtoSQL.py (new file)
@@ -0,0 +1,126 @@
#
# Title: Scraping Alpha
# Version: 1.0
# Author: Ben Goldsworthy <b.goldsworthy@lancaster.ac.uk>
#
# This file is a part of Scraping Alpha, a series of scripts to scrape
# earnings call transcripts from seekingalpha.com and present them as useful
# SQL.
#
# This file takes the `transcripts.json` file output of `transcript_spider.py`
# and converts it into SQL.
#
# This file should be located in the same directory as `transcripts.json`, and
# is run via 'python JSONtoSQL.py > [FILE].sql', where '[FILE]' is the name of
# the output file.
#

import json
import sys
import codecs

sys.stdout = codecs.getwriter('utf-8')(sys.stdout)

json_data = open('transcripts.json').read()

data = json.loads(json_data)

executives = []
analysts = []

# For each transcript, creates new, separate arrays of executives and analysts
# for their own database tables, replacing their tuples in the transcript with
# their database keys.
for entry in data:
    indexExec = len(executives)+1
    indexAnal = len(analysts)+1

    newExecs = []
    for executive in entry['entry']['exec']:
        if executive not in executives:
            executives.append(executive)
            newExecs.append(indexExec)
            indexExec += 1
        else:
            newExecs.append(executives.index(executive) + 1)
    entry['entry']['exec'] = newExecs

    newAnals = []
    for analyst in entry['entry']['analysts']:
        if analyst not in analysts:
            analysts.append(analyst)
            newAnals.append(indexAnal)
            indexAnal += 1
        else:
            newAnals.append(analysts.index(analyst) + 1)
    entry['entry']['analysts'] = newAnals

# Outputs the SQL file that creates the various tables and populates them with
# INSERT statements.
print "CREATE TABLE IF NOT EXISTS `execs`"
print "("
print "\t`id` INT NOT NULL UNIQUE AUTO_INCREMENT,"
print "\t`name` VARCHAR(255),"
print "\t`position` VARCHAR(255),"
print "\t`company` VARCHAR(255),"
print "\tPRIMARY KEY(`id`)"
print ");\n"

print "INSERT INTO `execs` (`name`, `position`, `company`) VALUES"
print "\t(0,0,0)",
for executive in executives:
    print ","
    print "\t(\""+executive[0]+"\",\""+executive[1]+"\",\""+executive[2]+"\")",
print ";\n"

print "CREATE TABLE IF NOT EXISTS `analysts`"
print "("
print "\t`id` INT NOT NULL UNIQUE AUTO_INCREMENT,"
print "\t`name` VARCHAR(255),"
print "\t`company` VARCHAR(255),"
print "\tPRIMARY KEY(`id`)"
print ");\n"

print "INSERT INTO `analysts` (`name`, `company`) VALUES"
print "\t(0,0)",
for analyst in analysts:
    print ","
    print "\t(\""+analyst[0]+"\",\""+analyst[1]+"\")",
print ";\n"

print "CREATE TABLE IF NOT EXISTS `transcripts`"
print "("
print "\t`id` INT NOT NULL UNIQUE AUTO_INCREMENT,"
print "\t`title` VARCHAR(255),"
print "\t`company` VARCHAR(255),"
print "\t`execs` VARCHAR(255),"
print "\t`analysts` VARCHAR(255),"
print "\t`transcript` TEXT,"
print "\tPRIMARY KEY(`id`)"
print ");\n"

print "INSERT INTO `transcripts` (`title`, `company`, `execs`, `analysts`, `transcript`) VALUES"
print "\t(0,0,0,0,0)",
for entry in data:
    tran = entry['entry']
    print ","
    print "\t(\""+tran['title']+"\",\""+tran['company']+"\",\""+(';'.join(str(x) for x in tran['exec']))+"\",\""+(';'.join(str(x) for x in tran['analysts']))+"\",\""+tran['transcript']+"\")",
print ";\n"

print "CREATE TABLE IF NOT EXISTS `execs_to_transcripts`"
print "("
print "\t`exec_id` INT NOT NULL,"
print "\t`transcript_id` INT NOT NULL,"
print "\tPRIMARY KEY(`exec_id`, `transcript_id`),"
print "\tFOREIGN KEY (`exec_id`) REFERENCES `execs`(`id`),"
print "\tFOREIGN KEY (`transcript_id`) REFERENCES `transcripts`(`id`)"
print ");\n"

print "CREATE TABLE IF NOT EXISTS `analysts_to_transcripts`"
print "("
print "\t`analyst_id` INT NOT NULL,"
print "\t`transcript_id` INT NOT NULL,"
print "\tPRIMARY KEY(`analyst_id`, `transcript_id`),"
print "\tFOREIGN KEY (`analyst_id`) REFERENCES `analysts`(`id`),"
print "\tFOREIGN KEY (`transcript_id`) REFERENCES `transcripts`(`id`)"
print ");"
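For reference, a minimal sketch of one record in `transcripts.json` as the script above reads it; the keys mirror what `JSONtoSQL.py` accesses, but the values here are invented:

# Illustrative record only -- field values are made up; the keys come from the
# lookups in JSONtoSQL.py above (entry -> title/company/exec/analysts/transcript).
example_record = {
    "entry": {
        "title": "Example Corp Q4 2016 Results Earnings Call",
        "company": "Example Corp",
        "exec": [["Jane Doe", "Chief Executive Officer", "Example Corp"]],
        "analysts": [["John Smith", "Example Securities"]],
        "transcript": "<h1>Operator</h1><p>Good day, everyone...</p>"
    }
}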
scraping-alpha/Scraping_Alpha/Scraping_Alpha/__init__.py (new empty file)

scraping-alpha/Scraping_Alpha/Scraping_Alpha/__init__.pyc (new binary file, not shown)
scraping-alpha/Scraping_Alpha/Scraping_Alpha/items.py (new file)
@@ -0,0 +1,14 @@
# -*- coding: utf-8 -*-

# Define here the models for your scraped items
#
# See documentation in:
# http://doc.scrapy.org/en/latest/topics/items.html

import scrapy


class ScrapingAlphaItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    pass
scraping-alpha/Scraping_Alpha/Scraping_Alpha/middlewares.py (new file)
@@ -0,0 +1,56 @@
# -*- coding: utf-8 -*-

# Define here the models for your spider middleware
#
# See documentation in:
# http://doc.scrapy.org/en/latest/topics/spider-middleware.html

from scrapy import signals


class ScrapingAlphaSpiderMiddleware(object):
    # Not all methods need to be defined. If a method is not defined,
    # scrapy acts as if the spider middleware does not modify the
    # passed objects.

    @classmethod
    def from_crawler(cls, crawler):
        # This method is used by Scrapy to create your spiders.
        s = cls()
        crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
        return s

    def process_spider_input(response, spider):
        # Called for each response that goes through the spider
        # middleware and into the spider.

        # Should return None or raise an exception.
        return None

    def process_spider_output(response, result, spider):
        # Called with the results returned from the Spider, after
        # it has processed the response.

        # Must return an iterable of Request, dict or Item objects.
        for i in result:
            yield i

    def process_spider_exception(response, exception, spider):
        # Called when a spider or process_spider_input() method
        # (from other spider middleware) raises an exception.

        # Should return either None or an iterable of Response, dict
        # or Item objects.
        pass

    def process_start_requests(start_requests, spider):
        # Called with the start requests of the spider, and works
        # similarly to the process_spider_output() method, except
        # that it doesn't have a response associated.

        # Must return only requests (not items).
        for r in start_requests:
            yield r

    def spider_opened(self, spider):
        spider.logger.info('Spider opened: %s' % spider.name)
scraping-alpha/Scraping_Alpha/Scraping_Alpha/pipelines.py (new file)
@@ -0,0 +1,11 @@
# -*- coding: utf-8 -*-

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html


class ScrapingAlphaPipeline(object):
    def process_item(self, item, spider):
        return item
scraping-alpha/Scraping_Alpha/Scraping_Alpha/settings.py (new file)
@@ -0,0 +1,91 @@
# -*- coding: utf-8 -*-

# Scrapy settings for Scraping_Alpha project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
# http://doc.scrapy.org/en/latest/topics/settings.html
# http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
# http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html

BOT_NAME = 'Scraping_Alpha'

SPIDER_MODULES = ['Scraping_Alpha.spiders']
NEWSPIDER_MODULE = 'Scraping_Alpha.spiders'

USER_AGENT = "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/27.0.1453.93 Safari/537.36"

# Crawl responsibly by identifying yourself (and your website) on the user-agent
#USER_AGENT = 'Scraping_Alpha (+http://www.yourdomain.com)'

# Obey robots.txt rules
ROBOTSTXT_OBEY = True

# Configure maximum concurrent requests performed by Scrapy (default: 16)
#CONCURRENT_REQUESTS = 32

# Configure a delay for requests for the same website (default: 0)
# See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
#DOWNLOAD_DELAY = 3
# The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16

# Disable cookies (enabled by default)
#COOKIES_ENABLED = False

# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False

# Override the default request headers:
#DEFAULT_REQUEST_HEADERS = {
#   'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
#   'Accept-Language': 'en',
#}

# Enable or disable spider middlewares
# See http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html
#SPIDER_MIDDLEWARES = {
#    'Scraping_Alpha.middlewares.ScrapingAlphaSpiderMiddleware': 543,
#}

# Enable or disable downloader middlewares
# See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
#DOWNLOADER_MIDDLEWARES = {
#    'Scraping_Alpha.middlewares.MyCustomDownloaderMiddleware': 543,
#}

# Enable or disable extensions
# See http://scrapy.readthedocs.org/en/latest/topics/extensions.html
#EXTENSIONS = {
#    'scrapy.extensions.telnet.TelnetConsole': None,
#}

# Configure item pipelines
# See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html
#ITEM_PIPELINES = {
#    'Scraping_Alpha.pipelines.SomePipeline': 300,
#}

# Enable and configure the AutoThrottle extension (disabled by default)
# See http://doc.scrapy.org/en/latest/topics/autothrottle.html
#AUTOTHROTTLE_ENABLED = True
# The initial download delay
#AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
#AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server
#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
#AUTOTHROTTLE_DEBUG = False

# Enable and configure HTTP caching (disabled by default)
# See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
#HTTPCACHE_ENABLED = True
#HTTPCACHE_EXPIRATION_SECS = 0
#HTTPCACHE_DIR = 'httpcache'
#HTTPCACHE_IGNORE_HTTP_CODES = []
#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
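If the crawl needs slowing down (the spider further below will otherwise walk every index page), the commented-out settings above are the relevant knobs; a minimal sketch, with illustrative values that are not part of this commit:

# Illustrative values only -- these settings appear commented out above and
# are not enabled in this commit; enabling something like this throttles the crawl.
DOWNLOAD_DELAY = 3
AUTOTHROTTLE_ENABLED = True
AUTOTHROTTLE_START_DELAY = 5
AUTOTHROTTLE_MAX_DELAY = 60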
scraping-alpha/Scraping_Alpha/Scraping_Alpha/settings.pyc (new binary file, not shown)
scraping-alpha/Scraping_Alpha/Scraping_Alpha/spiders/__init__.py (new file)
@@ -0,0 +1,4 @@
# This package will contain the spiders of your Scrapy project
#
# Please refer to the documentation for information on how to create and manage
# your spiders.

Binary file not shown.
scraping-alpha/Scraping_Alpha/Scraping_Alpha/spiders/transcript_spider.py (new file)
@@ -0,0 +1,171 @@
#
# Title: Scraping Alpha
# Version: 1.0
# Author: Ben Goldsworthy <b.goldsworthy@lancaster.ac.uk>
#
# This file is a part of Scraping Alpha, a series of scripts to scrape
# earnings call transcripts from seekingalpha.com and present them as useful
# SQL.
#
# This file is the webspider that Scrapy uses to retrieve the information from
# the website. Left unattended, it will scrape all 4,000+ pages of results.
#
# To interrupt this behaviour and still be able to proceed with the other
# steps, cancel the script with CTRL+Z. This will likely leave an unfinished
# JSON item at the end of the output file. To clear this up, open the file
# in vim and type the following keys:
# 'G', 'V', 'd', '$', 'i', 'BACKSPACE', 'ENTER', ']', 'ESC', ':wq', 'ENTER'
# This will truncate the file at the last complete record and seal it off.
#
# For installation instructions for Scrapy, visit
# <doc.scrapy.org/en/latest/intro/install.html>. This file should be in the
# `spiders` directory of the project, and is run via 'scrapy crawl transcripts
# -o transcripts.json' at the command line (the output file will be placed
# in the directory the Terminal is currently in).
#

# Some of the <Exec, Position> tuples are separated by an em- rather than an
# en-dash, which isn't featured in the ASCII charset, hence the below line:
# -*- coding: utf-8 -*-

import scrapy
# This enum lists the stages of each transcript.
from enum import Enum
Stage = Enum('Stage', 'preamble execs analysts body')
# Some transcript preambles are concatenated on a single line. This list is used
# to separate the title and date sections of the string.
months = ["Jan", "Feb", "Mar", "Apr", "May", "Jun", "Jul", "Aug", "Sep", "Oct", "Nov", "Dec"]
transcripts = {}

class TranscriptSpider(scrapy.Spider):
    name = 'transcripts'
    start_urls = ['http://seekingalpha.com/earnings/earnings-call-transcripts/1']

    def parse(self, response):
        # Follows each transcript page's link from the given index page.
        for href in response.css('.dashboard-article-link::attr(href)').extract():
            yield scrapy.Request(response.urljoin(href), callback=self.parse_transcript)

        # Follows the pagination links at the bottom of the given index page.
        next_page = response.css('li.next a::attr(href)').extract_first()
        if next_page is not None:
            next_page = response.urljoin(next_page)
            yield scrapy.Request(next_page, callback=self.parse)

    def parse_transcript(self, response):
        i = 0
        transcript = {}
        details = {}
        execs = []
        analysts = []
        script = []
        mode = 1

        # As the pages are represented by a series of `<p>` elements, all with
        # the same class `.p1` and no unique identifiers, we have to do this the
        # old-fashioned way - breaking it into chunks and iterating over them.
        body = response.css('div#a-body p.p1')
        chunks = body.css('p.p1')
        while i < len(chunks):
            # Headings mark the start of each new section. If the current line
            # is not a heading, or we're already in the transcript body (where
            # headings just denote speakers), handle it according to the current
            # section flag; otherwise, advance to the next section (see the
            # `else` branch below).
            if (len(chunks[i].css('strong::text').extract()) == 0) or (mode == 4):
                currStage = Stage(mode)
                # If we're on the preamble stage, each bit of data is extracted
                # separately as they all have their own key in the JSON.
                if currStage == Stage['preamble']:
                    # If we're on the first line of the preamble, that's the
                    # company name, stock exchange and ticker acronym (or should
                    # be - see below)
                    if i == 0:
                        # Checks to see if the second line is a heading. If not,
                        # everything is fine.
                        if len(chunks[1].css('strong::text').extract()) == 0:
                            details['company'] = chunks[i].css('p::text').extract_first()
                            if " (" in details['company']:
                                details['company'] = details['company'].split(' (')[0]
                            # If a specific stock exchange is not listed, it
                            # defaults to NYSE
                            details['exchange'] = "NYSE"
                            details['ticker'] = chunks.css('a::text').extract_first()
                            if ":" in details['ticker']:
                                ticker = details['ticker'].split(':')
                                details['exchange'] = ticker[0]
                                details['ticker'] = ticker[1]
                        # However, if it is, that means this line contains the
                        # full, concatenated preamble, so everything must be
                        # extracted here
                        else:
                            details['company'] = chunks[i].css('p::text').extract_first()
                            if " (" in details['company']:
                                details['company'] = details['company'].split(' (')[0]
                            # If a specific stock exchange is not listed, default to NYSE
                            details['exchange'] = "NYSE"
                            details['ticker'] = chunks.css('a::text').extract_first()
                            if ":" in details['ticker']:
                                ticker = details['ticker'].split(':')
                                details['exchange'] = ticker[0]
                                details['ticker'] = ticker[1]
                            titleAndDate = chunks[i].css('p::text').extract()[1]
                            for date in months:
                                if date in titleAndDate:
                                    splits = titleAndDate.split(date)
                                    details['title'] = splits[0]
                                    details['date'] = date + splits[1]
                    # Otherwise, we're onto the title line.
                    elif i == 1:
                        title = chunks[i].css('p::text').extract_first()
                        # This should never be the case, but just to be careful
                        # I'm leaving it in.
                        if len(title) <= 0:
                            title = "NO TITLE"
                        details['title'] = title
                    # Or the date line.
                    elif i == 2:
                        details['date'] = chunks[i].css('p::text').extract_first()
                # If we're onto the 'Executives' section, we create a list of
                # all of their names, positions and company name (from the
                # preamble).
                elif currStage == Stage['execs']:
                    anExec = chunks[i].css('p::text').extract_first().split(" - ")
                    # This covers if the execs are separated with an em- rather
                    # than an en-dash (see above).
                    if len(anExec) <= 1:
                        anExec = chunks[i].css('p::text').extract_first().split(" – ")
                    name = anExec[0]
                    if len(anExec) > 1:
                        position = anExec[1]
                    # Again, this should never be the case, as an Exec-less
                    # company would find it hard to get much done.
                    else:
                        position = ""
                    execs.append((name,position,details['company']))
                # This does the same, but with the analysts (which never seem
                # to be separated by em-dashes for some reason).
                elif currStage == Stage['analysts']:
                    name = chunks[i].css('p::text').extract_first().split(" - ")[0]
                    company = chunks[i].css('p::text').extract_first().split(" - ")[1]
                    analysts.append((name,company))
                # This strips the transcript body of everything except simple
                # HTML, and stores that.
                elif currStage == Stage['body']:
                    line = chunks[i].css('p::text').extract_first()
                    html = "p>"
                    if line is None:
                        line = chunks[i].css('strong::text').extract_first()
                        html = "h1>"
                    script.append("<"+html+line+"</"+html)
            else:
                mode += 1
            i += 1

        # Adds the various arrays to the dictionary for the transcript
        details['exec'] = execs
        details['analysts'] = analysts
        details['transcript'] = ''.join(script)

        # Adds this transcript to the dictionary of all scraped
        # transcripts, and yields that for the output
        transcript["entry"] = details
        yield transcript
Binary file not shown.
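As a standalone illustration of the dash-splitting used for the 'Executives' lines in `transcript_spider.py` above, the logic can be exercised in isolation; the helper and the names below are hypothetical and not part of the project:

# -*- coding: utf-8 -*-
# Hypothetical helper mirroring the exec-line parsing in the spider above.
def split_speaker(line):
    parts = line.split(" - ")
    if len(parts) <= 1:
        # Fall back to the other dash variant, as the spider does.
        parts = line.split(" – ")
    name = parts[0]
    position = parts[1] if len(parts) > 1 else ""
    return name, position

print split_speaker("Jane Doe - Chief Executive Officer")
# ('Jane Doe', 'Chief Executive Officer')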
scraping-alpha/Scraping_Alpha/execsAndAnalysts.py (new file)
@@ -0,0 +1,104 @@
#
# Title: Scraping Alpha
# Version: 1.0
# Author: Ben Goldsworthy <b.goldsworthy@lancaster.ac.uk>
#
# This file is a part of Scraping Alpha, a series of scripts to scrape
# earnings call transcripts from seekingalpha.com and present them as useful
# SQL.
#
# This file takes the `transcripts.sql` file exported from the database
# created using the output file of `JSONtoSQL.py` after the following query:
#   SELECT `id`, `execs`, `analysts` FROM `transcripts`
# From this it creates two files, `execs.sql` and `analysts.sql`, for creating
# linking tables in the database.
#
# This file should be located in the same directory as `transcripts.sql`, and
# is run via 'python execsAndAnalysts.py'.
#

import sys
import codecs
import os
from shutil import copyfile
import fileinput

sys.stdout = codecs.getwriter('utf-8')(sys.stdout)

start = 0

# Creates a temporary copy in case something goes Pete Tong.
copyfile("transcripts.sql", "transcripts.sql.tmp")

# Trims everything from the export except for the INSERT statement.
for line in fileinput.FileInput("transcripts.sql.tmp", inplace=1):
    if start == 0:
        if "INSERT INTO `transcripts`" in line:
            start = 1
            print "INSERT INTO `execs_to_transcripts` (`exec_id`, `transcript_id`) VALUES"
    else:
        if line == "\n":
            start = 0
        else:
            print "\t"+line,

# Copies the produced file to create both the output files, then deletes the
# temporary file.
copyfile("transcripts.sql.tmp", "execs.sql")
copyfile("transcripts.sql.tmp", "analysts.sql")
os.remove("transcripts.sql.tmp")

# Converts each line "(x, '0;...;n')" in the file to n separate INSERTs, one
# for each executive.
start = 0
nL = ""
for line in fileinput.FileInput("execs.sql", inplace=1):
    if start == 0:
        start = 1
    else:
        bits = line.split(', ')
        tID = bits[0].strip('\t').strip('(')
        execs = bits[1].split(';')
        newLines = ""
        for execID in execs:
            newLines = newLines + nL + "\t("+tID+", "+execID.strip('\'')+"),"
        line = line.replace(line, newLines)
        nL = "\n"
    print line,

# Does the same for the analysts.
start = 0
nL = ""
for line in fileinput.FileInput("analysts.sql", inplace=1):
    if start == 0:
        start = 1
        line = line.replace(line, "INSERT INTO `analysts_to_transcripts` (`analyst_id`, `transcript_id`) VALUES\n")
    else:
        bits = line.split(', ')
        tID = bits[0].strip('\t').strip('(')
        # As it is possible for there to be no analysts in a call, this ignores
        # blank results.
        if "''" not in bits[2]:
            analysts = bits[2].split(';')
            newLines = ""
            for analystID in analysts:
                # This stops the final transcript from getting an additional,
                # `analyst_id`-less INSERT
                if analystID != '\n':
                    newLines = newLines + nL + "\t("+tID+", "+analystID.strip('\'').strip('\'),\n')+"),"
            line = line.replace(line, newLines)
            nL = "\n"
        else:
            line = ""
    print line,

# Replace the final comma at the end of each file with a semicolon, to make it
# valid SQL
with open("execs.sql", 'rb+') as filehandle:
    filehandle.seek(-1, os.SEEK_END)
    filehandle.truncate()
    filehandle.write(";")
with open("analysts.sql", 'rb+') as filehandle:
    filehandle.seek(-1, os.SEEK_END)
    filehandle.truncate()
    filehandle.write(";")
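As an illustration of the expansion the script above performs, here is a hypothetical stand-alone version of the line-splitting step (the real script edits the files in place); the input row is in the style of the `transcripts.sql` dump further below:

# Hypothetical helper mirroring the expansion loop in execsAndAnalysts.py above.
def expand_row(line):
    bits = line.split(', ')
    tid = bits[0].strip('\t').strip('(')
    ids = [x.strip("'") for x in bits[1].split(';')]
    return [(tid, i) for i in ids]

print expand_row("\t(2, '1;2;3;4', '1'),")
# [('2', '1'), ('2', '2'), ('2', '3'), ('2', '4')]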
scraping-alpha/Scraping_Alpha/scrapy.cfg (new file)
@@ -0,0 +1,11 @@
# Automatically created by: scrapy startproject
#
# For more information about the [deploy] section see:
# https://scrapyd.readthedocs.org/en/latest/deploy.html

[settings]
default = Scraping_Alpha.settings

[deploy]
#url = http://localhost:6800/
project = Scraping_Alpha
scraping-alpha/Scraping_Alpha/transcripts.json (new file, 63 lines)
File diff suppressed because one or more lines are too long
scraping-alpha/Scraping_Alpha/transcripts.sql (new file)
@@ -0,0 +1,110 @@
-- phpMyAdmin SQL Dump
-- version 4.0.10.14
-- http://www.phpmyadmin.net
--
-- Host: localhost:3306
-- Generation Time: Dec 26, 2016 at 02:39 PM
-- Server version: 5.5.52-cll-lve
-- PHP Version: 5.6.20

SET SQL_MODE = "NO_AUTO_VALUE_ON_ZERO";
SET time_zone = "+00:00";


/*!40101 SET @OLD_CHARACTER_SET_CLIENT=@@CHARACTER_SET_CLIENT */;
/*!40101 SET @OLD_CHARACTER_SET_RESULTS=@@CHARACTER_SET_RESULTS */;
/*!40101 SET @OLD_COLLATION_CONNECTION=@@COLLATION_CONNECTION */;
/*!40101 SET NAMES utf8 */;

--
-- Database: `bengoldsworthy`
--

-- --------------------------------------------------------

--
-- Table structure for table `transcripts`
--

CREATE TABLE IF NOT EXISTS `transcripts` (
  `id` int(11) NOT NULL AUTO_INCREMENT,
  `title` varchar(255) DEFAULT NULL,
  `company` varchar(255) DEFAULT NULL,
  `execs` varchar(255) DEFAULT NULL,
  `analysts` varchar(255) DEFAULT NULL,
  `transcript` text,
  PRIMARY KEY (`id`),
  UNIQUE KEY `id` (`id`)
) ENGINE=MyISAM DEFAULT CHARSET=latin1 AUTO_INCREMENT=63 ;

--
-- Dumping data for table `transcripts`
--

INSERT INTO `transcripts` (`id`, `execs`, `analysts`) VALUES
(1, '0', '0'),
(2, '1;2;3;4', '1'),
(3, '5;6;7', '2;3;4;5;6;7;8;9;10'),
(4, '8;9;10', '11;12;13;14;15;16;17;18;19;20;21;22;23;24;25'),
(5, '11;12;13;14', '26;27;28'),
(6, '15;16;17;18;19', '29;30;31;32;33;34;35;36;37;38;39'),
(7, '20;21;22;23', '40;41;42;43;44;45;46;47;48'),
(8, '24;25;26', '49;50;51;52;53;54;55;56;57;58;59;60;61;62;63;64;65;66;67;68'),
(9, '27;28;29;30', '69;70;71'),
(10, '31;32', '72;73;74;75;76;77;78;79;80;81'),
(11, '33;34;35', '82;83;84;85;86;87;88;89;90;91;92;93;94;95'),
(12, '36;37;38;39', '96;97;98;81;99'),
(13, '40;41;42', '100;101;102;103;104;105;106'),
(14, '43;44;45;46;47;48;49;50;51', '107;108;109;110;111;112;113;114;115;116;117'),
(15, '52;53;54', '13;118;119;120;23;121'),
(16, '55;56;57', '122;39;123;124;125;126'),
(17, '58;59;60;61', '127;128;129;130;131'),
(18, '62;63', '132;133'),
(19, '64;65;66', '134;135;136;137;138;139;140;141;142'),
(20, '67;68', '141;143;144;145;146;142;147;148;149;95;150;151;152;153'),
(21, '69;70;71', '154;155;156;157;158;159;160;161;162;163;164;165;166;167;168;169;170'),
(22, '72;73;74', '171;172;173;174'),
(23, '75;76;77;78', '175;176;177;178;179;180;181'),
(24, '79;80;81', '182;183'),
(25, '82;83;84', '184;185;186;187;188;189;190;191;192;193;194;195;196;197;198'),
(26, '85;86;87;88', '199;200;201;202;203;204;205;206;207;208;209;210;211;212;213'),
(27, '89;90;91;92;93', '214;215;216;44;217;218;41;219;45;220'),
(28, '94;95;96', '71;70'),
(29, '97;98;99', ''),
(30, '100;101', '221;222;131'),
(31, '102;103;104;105', '223;224;225'),
(32, '106;107', '226;85;227;228;229;230;231;232;112;233;234;235'),
(33, '108;109', '236;237;238'),
(34, '110;111', '239;240;241;242;243;244'),
(35, '112;113;114', '245;246'),
(36, '115;116;117;118', '247;248'),
(37, '119;120;121', '249;250'),
(38, '122;123', '251;252;253'),
(39, '124;125;126', '254;255;256;257;258;259;260;261;262'),
(40, '127;128;129', '263;264;265'),
(41, '130;131;132;133', '266;267'),
(42, '134;135;136;137;138;139', '268;269;270;271;272'),
(43, '140;141;142', '190;273;274;275;276;277;278;279;280;186'),
(44, '143;144;145', '281;164;282;154;283;284;285;286;287;288;289;290;291'),
(45, '146;147;148', '292;293;294'),
(46, '149;150;151', '295'),
(47, '152;153;154;155', '165;296;166;297;298;299;160;300'),
(48, '156;157;158', '301;302;303;304;305;306;307;308'),
(49, '159;160;161', '309;310;311'),
(50, '162;163', '312;313'),
(51, '164;165', '314'),
(52, '166;167;168', '315;316;317;318'),
(53, '169;170;171', '319;320'),
(54, '172;173;174;175;176;177', '321;322;323;324;325;326;327;328'),
(55, '178;179;180', '329;330;331;332;333'),
(56, '181;182', '334;335'),
(57, '183;184;185', '336;337;338;339;340;341;342;343'),
(58, '186;187;188;189;190;191;192;193;194;195', '344;345;238;346;347;348;349;350;351;352;353;237;354'),
(59, '196;197;198', '355;356;357;358;359'),
(60, '199;200;201', '360;361'),
(61, '202;203', '362;363'),
(62, '204;205;206', '364;365');

/*!40101 SET CHARACTER_SET_CLIENT=@OLD_CHARACTER_SET_CLIENT */;
/*!40101 SET CHARACTER_SET_RESULTS=@OLD_CHARACTER_SET_RESULTS */;
/*!40101 SET COLLATION_CONNECTION=@OLD_COLLATION_CONNECTION */;
(deleted file, 73 lines: an earlier version of the transcript spider; path not shown in this view)
@@ -1,73 +0,0 @@
import scrapy
import re

class TranscriptSpider(scrapy.Spider):
    name = 'transcripts'

    start_urls = ['http://seekingalpha.com/earnings/earnings-call-transcripts/1']
    def cleanhtml(raw_html):
        cleanr = re.compile('<.*?>')
        cleantext = re.sub(cleanr, '', raw_html)
        return cleantext

    def parse(self, response):
        # follow links to transcript pages
        for href in response.css('.dashboard-article-link::attr(href)').extract():
            yield scrapy.Request(response.urljoin(href),
                                 callback=self.parse_transcript)

        # follow pagination links
        #next_page = response.css('li.next a::attr(href)').extract_first()
        #if next_page is not None:
        #    next_page = response.urljoin(next_page)
        #    yield scrapy.Request(next_page, callback=self.parse)

    def parse_transcript(self, response):
        i = 4
        def extract_with_css(query):
            return response.css(query).extract_first().strip()

        body = response.css('div#a-body p.p1')
        chunks = body.css('p.p1')
        firstline = chunks[0].css('p::text').extract()
        ticker = chunks.css('a::text').extract_first()
        if ":" in ticker:
            ticker = ticker.split(':')[1]

        name = re.compile('([A-z -]* - [A-z ,&-]*)')
        execs = []
        analysts = []

        nextLine = chunks[i].css('p::text').extract_first()
        while re.match(name, nextLine) is not None:
            execs.append(nextLine)
            i += i
            nextLine = chunks[i].css('p::text').extract_first()
        print "DONE EXECS"
        print i
        print "Next line: "+nextLine
        while re.match(name, nextLine) is not None:
            analysts.append(nextLine)
            i += i
            nextLine = chunks[i].css('p::text').extract_first()
        print "DONE ANALYSTS"

        print execs
        print "-----------"
        print analysts
        print "^^^^^^^^^"

        #### PLACEHOLDER
        i = 0
        while True:
            print i ,": " , chunks[i].css('p::text').extract_first()
            print i ,": " , chunks[i].css('strong::text').extract_first()
            i += 1

        #yield {
        #    'company': firstline[0].split(" (", 1)[0],
        #    'stockmarket': firstline[0].split(" (", 1)[1],
        #    'ticker': ticker,
        #    'title': chunks[1].css('p::text').extract_first(),
        #    'date': chunks[2].css('p::text').extract_first()
        #}