Just Scraping Alpha
This commit is contained in:
parent 75e3dc5790
commit b6c1660f04
24 changed files with 8 additions and 1000 deletions
79
Scraping_Alpha/Scraping_Alpha/spiders/slides_spider.py
Normal file
@@ -0,0 +1,79 @@
#
# Title: Scraping Alpha
# Version: 1.0
# Author: Ben Goldsworthy <b.goldsworthy@lancaster.ac.uk>
#
# This file is a part of Scraping Alpha, a series of scripts to scrape
# earnings call transcripts from seekingalpha.com and present them as useful
# SQL.
#
# This file is the webspider that Scrapy uses to retrieve slides.
#
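# A usage note (an assumption added here, not part of the original header):
# since the spider is named 'slides' below, it would typically be run from
# the Scrapy project root with `scrapy crawl slides`.
#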

import scrapy

urls = []
# A transcript record can be uniquely identified using its company name + date.
uniqueID = ""
# Some transcript preambles are concatenated on a single line. This list is used
# to separate the title and date sections of the string.
months = ["Jan", "Feb", "Mar", "Apr", "May", "Jun", "Jul", "Aug", "Sep", "Oct", "Nov", "Dec"]
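# A worked example with hypothetical values (not from the original file): if a
# concatenated preamble yields the title-and-date text
#   "Q1 2017 Results - Earnings Call Jan 01, 2017 1:00 PM ET"
# then splitting on "Jan" leaves " 01, 2017 1:00 PM ET", and a company chunk of
# "Acme Corp (NYSE:ACM)" reduces to "Acme Corp", so the resulting uniqueID is
# "Acme Corp;Jan 01, 2017 1:00 PM ET".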

class SlidesSpider(scrapy.Spider):
    name = 'slides'
    start_urls = ['http://seekingalpha.com/earnings/earnings-call-transcripts/1']

    def parse(self, response):
        # Follows each transcript page's link from the given index page.
        for href in response.css('.dashboard-article-link::attr(href)').extract():
            yield scrapy.Request(response.urljoin(href), callback=self.parse_transcript)

        # Follows the pagination links at the bottom of the given index page.
        next_page = response.css('li.next a::attr(href)').extract_first()
        if next_page is not None:
            next_page = response.urljoin(next_page)
            yield scrapy.Request(next_page, callback=self.parse)
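
    # An aside (an illustrative addition, not in the original file): the
    # selectors above can be tested interactively with Scrapy's shell, e.g.
    #   scrapy shell 'http://seekingalpha.com/earnings/earnings-call-transcripts/1'
    #   >>> response.css('.dashboard-article-link::attr(href)').extract()
    #   >>> response.css('li.next a::attr(href)').extract_first()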

    def parse_transcript(self, response):
        # uniqueID is read again in parse_slides(), so it must be declared
        # global here; otherwise the assignments below would only bind a
        # local variable and the module-level uniqueID would stay empty.
        global uniqueID
        slides = response.css('li#slides a::attr(href)').extract_first()
        if slides is not None:
            body = response.css('div#a-body p.p1')
            chunks = body.css('p.p1')
            i = 0
            while i < 3:
                # If we're on the first line of the preamble, that's the
                # company name, stock exchange and ticker acronym (or should
                # be - see below)
                if i == 0:
                    # Checks to see if the second line is a heading. If not,
                    # everything is fine.
                    if len(chunks[1].css('strong::text').extract()) == 0:
                        uniqueID = chunks[i].css('p::text').extract_first()
                        if " (" in uniqueID:
                            uniqueID = uniqueID.split(' (')[0]
                        i = 2
                    # However, if it is, that means this line contains the
                    # full, concatenated preamble, so everything must be
                    # extracted here
                    else:
                        uniqueID = chunks[i].css('p::text').extract_first()
                        if " (" in uniqueID:
                            uniqueID = uniqueID.split(' (')[0]
                        titleAndDate = chunks[i].css('p::text').extract()[1]
                        for date in months:
                            if date in titleAndDate:
                                splits = titleAndDate.split(date)
                                uniqueID = uniqueID + ";" + date + splits[1]
                        i = 3
                # Otherwise, we're onto the date line.
                elif i == 2:
                    uniqueID = uniqueID + ";" + chunks[i].css('p::text').extract_first()
                    # The increment lives here: the i == 0 branch jumps
                    # straight to 2 or 3 above, so the date line is the last
                    # one that needs reading.
                    i += 1

            slides = response.urljoin(slides)
            # NB: Scrapy callbacks are expected to yield Requests, dicts or
            # Items; the engine will log an error for a bare string like this.
            yield uniqueID
            #yield scrapy.Request(slides, callback=self.parse_slides)

    def parse_slides(self, response):
        urls = response.css('figure img::attr(src)').extract()
        # "\\\\" is a literal pair of backslashes, used as a field separator.
        yield uniqueID + "\\\\" + ';'.join(urls)
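
# A minimal sketch (an illustrative addition, not part of the original
# commit): the spider can also be run programmatically via Scrapy's
# CrawlerProcess, as an alternative to `scrapy crawl slides`.
if __name__ == '__main__':
    from scrapy.crawler import CrawlerProcess

    process = CrawlerProcess()    # default settings, no project config assumed
    process.crawl(SlidesSpider)   # schedule this spider
    process.start()               # blocks until the crawl finishes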