Started on slides

parent f71a545201
commit 75e3dc5790

6 changed files with 86 additions and 6 deletions
@@ -79,3 +79,7 @@ From this it creates two files (`execs.sql` and `analysts.sql`). Import them
 into your DBMS to create two linking tables. The final instruction of
 `analysts.sql` then deletes the superfluous `execs` and `analysts` columns from
 the `transcripts` table (and for this reason, `execs.sql` must be imported first).
+
+### Future
+
+Harvesting the URLs of slide images shouldn't be too hard to implement - `slides_spider.py` should in theory do this, but the link to a transcript's slides is added to the page later via JavaScript, which means at the moment it throws up a load of HTTP 200 status codes and nowt else. [Scrapy+Splash](https://github.com/scrapy-plugins/scrapy-splash) may be the solution, however.
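
If Splash does pan out, the change might look something like the following - a minimal sketch, assuming the scrapy-splash middleware is enabled in `settings.py`, a Splash instance is listening on `localhost:8050`, and `parse_transcript` stays as in `slides_spider.py` below (the two-second render wait is a guess):

```python
import scrapy
from scrapy_splash import SplashRequest  # pip install scrapy-splash

class SlidesSpider(scrapy.Spider):
    name = 'slides'
    start_urls = ['http://seekingalpha.com/earnings/earnings-call-transcripts/1']

    def parse(self, response):
        for href in response.css('.dashboard-article-link::attr(href)').extract():
            # Render the page in Splash so the JavaScript that adds the
            # slides link has run before parse_transcript sees the HTML.
            yield SplashRequest(response.urljoin(href),
                                callback=self.parse_transcript,
                                args={'wait': 2})
```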
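On the import step in the context lines above, a minimal sketch of running the two generated scripts in the required order, assuming an SQLite-compatible database (`transcripts.db` is a hypothetical filename):

```python
import sqlite3

conn = sqlite3.connect('transcripts.db')  # hypothetical database file
for script in ('execs.sql', 'analysts.sql'):  # execs.sql must run first
    with open(script) as f:
        conn.executescript(f.read())
conn.commit()
conn.close()
```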

The new file (the `slides_spider.py` mentioned above):

@@ -0,0 +1,79 @@
#
# Title: Scraping Alpha
# Version: 1.0
# Author: Ben Goldsworthy <b.goldsworthy@lancaster.ac.uk>
#
# This file is a part of Scraping Alpha, a series of scripts to scrape
# earnings call transcripts from seekingalpha.com and present them as useful
# SQL.
#
# This file is the webspider that Scrapy uses to retrieve slides.
#

import scrapy

urls = []
# A transcript record can be uniquely identified using its company name + date.
uniqueID = ""
# Some transcript preambles are concatenated on a single line. This list is used
# to separate the title and date sections of the string.
months = ["Jan", "Feb", "Mar", "Apr", "May", "Jun", "Jul", "Aug", "Sep", "Oct", "Nov", "Dec"]
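# For example, a hypothetical concatenated preamble such as
#   "Company X Q2 2016 Results - Earnings Call Transcript Aug 4, 2016 1:00 PM ET"
# splits on "Aug" into the title (everything before the month) and the
# remainder (" 4, 2016 1:00 PM ET"), from which the date is rebuilt as
# date + splits[1].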

class SlidesSpider(scrapy.Spider):
    name = 'slides'
    start_urls = ['http://seekingalpha.com/earnings/earnings-call-transcripts/1']

    def parse(self, response):
        # Follows each transcript page's link from the given index page.
        for href in response.css('.dashboard-article-link::attr(href)').extract():
            yield scrapy.Request(response.urljoin(href), callback=self.parse_transcript)

        # Follows the pagination links at the bottom of the given index page.
        next_page = response.css('li.next a::attr(href)').extract_first()
        if next_page is not None:
            next_page = response.urljoin(next_page)
            yield scrapy.Request(next_page, callback=self.parse)

    def parse_transcript(self, response):
        slides = response.css('li#slides a::attr(href)').extract_first()
        if slides is not None:
            body = response.css('div#a-body p.p1')
            chunks = body.css('p.p1')
            i = 0
            while i < 3:
                # If we're on the first line of the preamble, that's the
                # company name, stock exchange and ticker acronym (or should
                # be - see below).
                if i == 0:
                    # Checks to see if the second line is a heading. If not,
                    # everything is fine.
                    if len(chunks[1].css('strong::text').extract()) == 0:
                        uniqueID = chunks[i].css('p::text').extract_first()
                        if " (" in uniqueID:
                            uniqueID = uniqueID.split(' (')[0]
                        # Skip the title line; the next pass through the
                        # loop handles the date line (i == 2).
                        i = 1
                    # However, if it is, that means this line contains the
                    # full, concatenated preamble, so everything must be
                    # extracted here.
                    else:
                        uniqueID = chunks[i].css('p::text').extract_first()
                        if " (" in uniqueID:
                            uniqueID = uniqueID.split(' (')[0]
                        titleAndDate = chunks[i].css('p::text').extract()[1]
                        for date in months:
                            if date in titleAndDate:
                                splits = titleAndDate.split(date)
                                uniqueID = uniqueID + ";" + date + splits[1]
                                i = 3
                # Otherwise, we're onto the date line.
                elif i == 2:
                    uniqueID = uniqueID + ";" + chunks[i].css('p::text').extract_first()
                i += 1

            slides = response.urljoin(slides)
            # Scrapy callbacks must yield Requests, dicts or Items rather
            # than bare strings, so the ID is wrapped in a dict.
            yield {'uniqueID': uniqueID}
            # Once slide harvesting works, follow the slides link instead,
            # carrying the ID along in the request's meta dict:
            #yield scrapy.Request(slides, callback=self.parse_slides,
            #                     meta={'uniqueID': uniqueID})

    def parse_slides(self, response):
        urls = response.css('figure img::attr(src)').extract()
        # 'record' is an illustrative key; the value is the unique ID, a
        # backslash, then the slide image URLs joined by semicolons.
        yield {'record': response.meta['uniqueID'] + "\\" + ';'.join(urls)}
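Presumably the spider is run from the project directory with `scrapy crawl slides -o urls.json`, which would explain the empty `urls.json` added by this commit. A sketch of the equivalent programmatic run, assuming the usual Scrapy project layout (the import path is a guess):

```python
from scrapy.crawler import CrawlerProcess
from Scraping_Alpha.spiders.slides_spider import SlidesSpider  # hypothetical path

process = CrawlerProcess(settings={
    'FEED_FORMAT': 'json',   # serialise the yielded dicts as JSON
    'FEED_URI': 'urls.json',
})
process.crawl(SlidesSpider)
process.start()  # blocks until the crawl finishes
```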
Binary file not shown.

In the transcript spider (presumably `transcript_spider.py`), the encoding declaration moves up to the top of the file:

@@ -1,4 +1,5 @@
 #
+#-*- coding: utf-8 -*-
 # Title: Scraping Alpha
 # Version: 1.0
 # Author: Ben Goldsworthy <b.goldsworthy@lancaster.ac.uk>

@@ -24,10 +25,6 @@
 # in the directory the Terminal is currently in).
 #
 
-# Some of the <Exec, Position> tuples are separate by an em- rather than an
-# en-dash, which isn't featured in the ASCII charset, hence the below line:
-#-*- coding: utf-8 -*-
-
 import scrapy
 # This enum lists the stages of each transcript.
 from enum import Enum
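The move matters: Python only honours a `coding` declaration on the first or second line of a file (PEP 263), so thirty-odd lines down it was doing nothing. A two-line sketch of the rule, with a hypothetical literal of the kind the removed comment describes:

```python
#-*- coding: utf-8 -*-  # only takes effect on line 1 or 2 of the file
exec_tuple = u'John Smith — CEO'  # em-dash needs the declaration above (Python 2)
```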

@@ -112,7 +109,7 @@ class TranscriptSpider(scrapy.Spider):
                     if date in titleAndDate:
                         splits = titleAndDate.split(date)
                         details['title'] = splits[0]
-                        details['date'] = dates + splits[1]
+                        details['date'] = date + splits[1]
                 # Otherwise, we're onto the title line.
                 elif i == 1:
                     title = chunks[i].css('p::text').extract_first()
Binary file not shown.
scraping-alpha/Scraping_Alpha/urls.json (new file, 0 lines)