Started on slides

Ben Goldsworthy 2016-12-27 00:45:46 +00:00
parent f71a545201
commit 75e3dc5790
6 changed files with 86 additions and 6 deletions

@@ -79,3 +79,7 @@ From this it creates two files (`execs.sql` and `analysts.sql`). Import them
into your DBMS to create two linking tables. The final instruction of
`analysts.sql` then deletes the superfluous `execs` and `analysts` columns from
the `transcripts` table (and for this reason, `execs.sql` must be imported first).
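For example, assuming a MySQL database (the database name below is just a placeholder):

    mysql -u username -p scraping_alpha < execs.sql
    mysql -u username -p scraping_alpha < analysts.sql
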
### Future
Harvesting the URLs of slide images shouldn't be too hard to implement - `slides_spider.py` should in theory do this, but the link to a transcript's slides is added to the page later via JavaScript, which means at the moment it throws up a load of HTTP 200 status codes and nowt else. [Scrapy+Splash](https://github.com/scrapy-plugins/scrapy-splash) may be the solution, however.
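As a rough sketch of the Splash route (untested; it assumes a local Splash instance on port 8050 and the scrapy-splash middleware configured per that repo's README), the transcript links in `parse()` would be fetched with `SplashRequest` so the JavaScript that inserts the slides link has a chance to run:

    from scrapy_splash import SplashRequest

    def parse(self, response):
        for href in response.css('.dashboard-article-link::attr(href)').extract():
            # Render the page in Splash, waiting briefly for the JS to run.
            yield SplashRequest(response.urljoin(href), self.parse_transcript,
                                args={'wait': 2})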

@@ -0,0 +1,79 @@
#
# Title: Scraping Alpha
# Version: 1.0
# Author: Ben Goldsworthy <b.goldsworthy@lancaster.ac.uk>
#
# This file is a part of Scraping Alpha, a series of scripts to scrape
# earnings call transcripts from seekingalpha.com and present them as useful
# SQL.
#
# This file is the webspider that Scrapy uses to retrieve slides.
#
import scrapy
# Some transcript preambles are concatenated on a single line. This list is
# used to separate the title and date sections of the string.
months = ["Jan", "Feb", "Mar", "Apr", "May", "Jun", "Jul", "Aug", "Sep", "Oct", "Nov", "Dec"]

class SlidesSpider(scrapy.Spider):
    name = 'slides'

    start_urls = ['http://seekingalpha.com/earnings/earnings-call-transcripts/1']

    def parse(self, response):
        # Follows each transcript page's link from the given index page.
        for href in response.css('.dashboard-article-link::attr(href)').extract():
            yield scrapy.Request(response.urljoin(href), callback=self.parse_transcript)

        # Follows the pagination links at the bottom of the given index page.
        next_page = response.css('li.next a::attr(href)').extract_first()
        if next_page is not None:
            next_page = response.urljoin(next_page)
            yield scrapy.Request(next_page, callback=self.parse)

    def parse_transcript(self, response):
        slides = response.css('li#slides a::attr(href)').extract_first()
        if slides is not None:
            # A transcript record can be uniquely identified using its
            # company name + date.
            uniqueID = ""
            chunks = response.css('div#a-body p.p1')
            i = 0
            while i < 3:
                # If we're on the first line of the preamble, that's the
                # company name, stock exchange and ticker acronym (or should
                # be - see below).
                if i == 0:
                    # Checks to see if the second line is a heading. If not,
                    # everything is fine.
                    if len(chunks[1].css('strong::text').extract()) == 0:
                        uniqueID = chunks[i].css('p::text').extract_first()
                        if " (" in uniqueID:
                            uniqueID = uniqueID.split(' (')[0]
                        # Skips the title line; the increment below moves us
                        # straight on to the date line (i == 2).
                        i = 1
                    # However, if it is, that means this line contains the
                    # full, concatenated preamble, so everything must be
                    # extracted here.
                    else:
                        uniqueID = chunks[i].css('p::text').extract_first()
                        if " (" in uniqueID:
                            uniqueID = uniqueID.split(' (')[0]
                        titleAndDate = chunks[i].css('p::text').extract()[1]
                        for date in months:
                            if date in titleAndDate:
                                splits = titleAndDate.split(date)
                                uniqueID = uniqueID + ";" + date + splits[1]
                        i = 3
                # Otherwise, we're onto the date line.
                elif i == 2:
                    uniqueID = uniqueID + ";" + chunks[i].css('p::text').extract_first()
                i += 1
            slides = response.urljoin(slides)
            # The transcript's ID is passed along to the slides page via the
            # request's meta dict so that parse_slides() can attach it to its
            # output.
            yield scrapy.Request(slides, callback=self.parse_slides,
                                 meta={'uniqueID': uniqueID})

    def parse_slides(self, response):
        # Retrieves the transcript ID stashed in the meta dict above.
        uniqueID = response.meta['uniqueID']
        urls = response.css('figure img::attr(src)').extract()
        # Scrapy callbacks must yield dicts (or Items/Requests) rather than
        # bare strings, so the record is wrapped in one; the key name is
        # arbitrary.
        yield {'slides': uniqueID + "\\\\" + ';'.join(urls)}
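By analogy with the transcript spider, the finished slides spider would presumably be run from the Terminal with something like the following (the output filename is arbitrary):

    scrapy crawl slides -o slides.json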

@@ -1,4 +1,5 @@
#
#-*- coding: utf-8 -*-
# Title: Scraping Alpha
# Version: 1.0
# Author: Ben Goldsworthy <b.goldsworthy@lancaster.ac.uk>
@@ -24,10 +25,6 @@
# in the directory the Terminal is currently in).
#
# Some of the <Exec, Position> tuples are separated by an em- rather than an
# en-dash, which isn't featured in the ASCII charset, hence the below line:
#-*- coding: utf-8 -*-
import scrapy
# This enum lists the stages of each transcript.
from enum import Enum
@@ -112,7 +109,7 @@ class TranscriptSpider(scrapy.Spider):
if date in titleAndDate:
splits = titleAndDate.split(date)
details['title'] = splits[0]
details['date'] = dates + splits[1]
details['date'] = date + splits[1]
# Otherwise, we're onto the title line.
elif i == 1:
title = chunks[i].css('p::text').extract_first()
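For illustration, here is how the fixed line behaves on a made-up concatenated preamble (assuming the `months` list and `details` dict from the surrounding spider):

    titleAndDate = "Q3 2016 Results - Earnings Call TranscriptDec 15, 2016"
    for date in months:
        if date in titleAndDate:
            splits = titleAndDate.split(date)
            details['title'] = splits[0]        # "Q3 2016 Results - Earnings Call Transcript"
            details['date'] = date + splits[1]  # "Dec 15, 2016"; 'dates' was a NameError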
