diff --git a/scraping-alpha/Scraping_Alpha/README b/scraping-alpha/Scraping_Alpha/README.md
similarity index 86%
rename from scraping-alpha/Scraping_Alpha/README
rename to scraping-alpha/Scraping_Alpha/README.md
index 31204e5..034a13f 100644
--- a/scraping-alpha/Scraping_Alpha/README
+++ b/scraping-alpha/Scraping_Alpha/README.md
@@ -79,3 +79,7 @@ It creates from this two files (`execs.sql` and `analysts.sql`). Import them
 into your DBMS to create two linking tables. The final instruction of
 `analysts.sql` then deletes the superfluous `execs` and `analysts` columns
 from the `transcripts` table (and for this reason, `execs.sql` must be imported first).
+
+### Future
+
+Harvesting the URLs of slide images shouldn't be too hard to implement - `slides_spider.py` should in theory do this, but the link to a transcript's slides is added to the page later via JavaScript, which means at the moment it throws up a load of HTTP 200 status codes and nowt else. [Scrapy+Splash](https://github.com/scrapy-plugins/scrapy-splash) may be the solution, however.
diff --git a/scraping-alpha/Scraping_Alpha/Scraping_Alpha/spiders/slides_spider.py b/scraping-alpha/Scraping_Alpha/Scraping_Alpha/spiders/slides_spider.py
new file mode 100644
index 0000000..29c22e8
--- /dev/null
+++ b/scraping-alpha/Scraping_Alpha/Scraping_Alpha/spiders/slides_spider.py
@@ -0,0 +1,82 @@
+#
+# Title: Scraping Alpha
+# Version: 1.0
+# Author: Ben Goldsworthy
+#
+# This file is a part of Scraping Alpha, a series of scripts to scrape
+# earnings call transcripts from seekingalpha.com and present them as useful
+# SQL.
+#
+# This file is the webspider that Scrapy uses to retrieve slides.
+#
+
+import scrapy
+
+# A transcript record can be uniquely identified using its company name + date.
+# Some transcript preambles are concatenated on a single line. This list is
+# used to separate the title and date sections of the string.
+months = ["Jan", "Feb", "Mar", "Apr", "May", "Jun", "Jul", "Aug", "Sep", "Oct", "Nov", "Dec"]
+
+class SlidesSpider(scrapy.Spider):
+    name = 'slides'
+    start_urls = ['http://seekingalpha.com/earnings/earnings-call-transcripts/1']
+
+    def parse(self, response):
+        # Follows each transcript page's link from the given index page.
+        for href in response.css('.dashboard-article-link::attr(href)').extract():
+            yield scrapy.Request(response.urljoin(href), callback=self.parse_transcript)
+
+        # Follows the pagination links at the bottom of given index page.
+        next_page = response.css('li.next a::attr(href)').extract_first()
+        if next_page is not None:
+            next_page = response.urljoin(next_page)
+            yield scrapy.Request(next_page, callback=self.parse)
+
+    def parse_transcript(self, response):
+        slides = response.css('li#slides a::attr(href)').extract_first()
+        if slides is not None:
+            body = response.css('div#a-body p.p1')
+            chunks = body.css('p.p1')
+            i = 0
+            uniqueID = ""
+            while i < 3:
+                # If we're on the first line of the preamble, that's the
+                # company name, stock exchange and ticker acronym (or should
+                # be - see below)
+                if i == 0:
+                    # Checks to see if the second line is a heading. If not,
+                    # everything is fine.
+                    if len(chunks[1].css('strong::text').extract()) == 0:
+                        uniqueID = chunks[i].css('p::text').extract_first()
+                        if " (" in uniqueID:
+                            uniqueID = uniqueID.split(' (')[0]
+                        # Skips the title line so that the next iteration
+                        # lands on the date line.
+                        i = 1
+                    # However, if it is, that means this line contains the
+                    # full, concatenated preamble, so everything must be
+                    # extracted here
+                    else:
+                        uniqueID = chunks[i].css('p::text').extract_first()
+                        if " (" in uniqueID:
+                            uniqueID = uniqueID.split(' (')[0]
+                        titleAndDate = chunks[i].css('p::text').extract()[1]
+                        for date in months:
+                            if date in titleAndDate:
+                                splits = titleAndDate.split(date)
+                                uniqueID = uniqueID + ";" + date + splits[1]
+                                i = 3
+                # Otherwise, we're onto the date line.
+                elif i == 2:
+                    uniqueID = uniqueID + ";" + chunks[i].css('p::text').extract_first()
+                i += 1
+
+            slides = response.urljoin(slides)
+            # Scrapy expects dicts/items rather than bare strings; the ID is
+            # also passed to parse_slides() via the request's meta dict.
+            yield {'uniqueID': uniqueID}
+            #yield scrapy.Request(slides, callback=self.parse_slides, meta={'uniqueID': uniqueID})
+
+    def parse_slides(self, response):
+        urls = response.css('figure img::attr(src)').extract()
+        yield {'uniqueID': response.meta['uniqueID'], 'slides': ';'.join(urls)}
diff --git a/scraping-alpha/Scraping_Alpha/Scraping_Alpha/spiders/slides_spider.pyc b/scraping-alpha/Scraping_Alpha/Scraping_Alpha/spiders/slides_spider.pyc
new file mode 100644
index 0000000..863f619
Binary files /dev/null and b/scraping-alpha/Scraping_Alpha/Scraping_Alpha/spiders/slides_spider.pyc differ
diff --git a/scraping-alpha/Scraping_Alpha/Scraping_Alpha/spiders/transcript_spider.py b/scraping-alpha/Scraping_Alpha/Scraping_Alpha/spiders/transcript_spider.py
index ff59738..2767934 100644
--- a/scraping-alpha/Scraping_Alpha/Scraping_Alpha/spiders/transcript_spider.py
+++ b/scraping-alpha/Scraping_Alpha/Scraping_Alpha/spiders/transcript_spider.py
@@ -1,4 +1,5 @@
-#
+#-*- coding: utf-8 -*-
+
 # Title: Scraping Alpha
 # Version: 1.0
 # Author: Ben Goldsworthy
@@ -24,10 +25,6 @@
 # in the directory the Terminal is currently in).
 #
 
-# Some of the tuples are separate by an em- rather than an
-# en-dash, which isn't featured in the ASCII charset, hence the below line:
-#-*- coding: utf-8 -*-
-
 import scrapy
 # This enum lists the stages of each transcript.
 from enum import Enum
@@ -112,7 +109,7 @@ class TranscriptSpider(scrapy.Spider):
                         if date in titleAndDate:
                             splits = titleAndDate.split(date)
                             details['title'] = splits[0]
-                            details['date'] = dates + splits[1]
+                            details['date'] = date + splits[1]
                 # Otherwise, we're onto the title line.
                 elif i == 1:
                     title = chunks[i].css('p::text').extract_first()
diff --git a/scraping-alpha/Scraping_Alpha/Scraping_Alpha/spiders/transcript_spider.pyc b/scraping-alpha/Scraping_Alpha/Scraping_Alpha/spiders/transcript_spider.pyc
index 09eb89e..78fa5df 100644
Binary files a/scraping-alpha/Scraping_Alpha/Scraping_Alpha/spiders/transcript_spider.pyc and b/scraping-alpha/Scraping_Alpha/Scraping_Alpha/spiders/transcript_spider.pyc differ
diff --git a/scraping-alpha/Scraping_Alpha/urls.json b/scraping-alpha/Scraping_Alpha/urls.json
new file mode 100644
index 0000000..e69de29
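
A note on the `Future` section added to the README above: if Scrapy+Splash turns out to be the answer, the wiring would look roughly like the sketch below. This is an untested sketch, not part of the patch; it assumes a Splash instance running on the default `localhost:8050`, with the middleware settings taken from the scrapy-splash README.

    # settings.py - register the Splash middlewares (values per the
    # scrapy-splash README)
    SPLASH_URL = 'http://localhost:8050'
    DOWNLOADER_MIDDLEWARES = {
        'scrapy_splash.SplashCookiesMiddleware': 723,
        'scrapy_splash.SplashMiddleware': 725,
        'scrapy.downloadermiddlewares.httpcompression.HttpCompressionMiddleware': 810,
    }
    SPIDER_MIDDLEWARES = {
        'scrapy_splash.SplashDeduplicateArgsMiddleware': 100,
    }
    DUPEFILTER_CLASS = 'scrapy_splash.SplashAwareDupeFilter'

    # slides_spider.py - fetch each transcript page through Splash so the
    # JavaScript-added slides link is present before parse_transcript() runs
    from scrapy_splash import SplashRequest

    def parse(self, response):
        for href in response.css('.dashboard-article-link::attr(href)').extract():
            # 'wait' gives the page's JavaScript time to insert the link
            yield SplashRequest(response.urljoin(href),
                                callback=self.parse_transcript,
                                args={'wait': 2})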