Started SeekingAlphaScraper
This commit is contained in:
parent 113951ba39
commit f0998ecd98
9 changed files with 258 additions and 11280 deletions
73  seeking-alpha-scraper/transcripts_spider.py  Normal file
@@ -0,0 +1,73 @@
import scrapy
import re

class TranscriptSpider(scrapy.Spider):
    name = 'transcripts'

    start_urls = ['http://seekingalpha.com/earnings/earnings-call-transcripts/1']

    def cleanhtml(self, raw_html):
        # strip HTML tags with a non-greedy regex (helper, not called anywhere yet)
        cleanr = re.compile('<.*?>')
        cleantext = re.sub(cleanr, '', raw_html)
        return cleantext

    def parse(self, response):
        # follow links to transcript pages
        for href in response.css('.dashboard-article-link::attr(href)').extract():
            yield scrapy.Request(response.urljoin(href),
                                 callback=self.parse_transcript)

        # follow pagination links
        #next_page = response.css('li.next a::attr(href)').extract_first()
        #if next_page is not None:
        #    next_page = response.urljoin(next_page)
        #    yield scrapy.Request(next_page, callback=self.parse)

    def parse_transcript(self, response):
        # participant names start at the fifth paragraph: the company line,
        # title, date and "Executives" header come first
        i = 4

        def extract_with_css(query):
            # helper, currently unused
            return response.css(query).extract_first().strip()

        body = response.css('div#a-body')
        chunks = body.css('p.p1')
        firstline = chunks[0].css('p::text').extract()
        # the first link in the body holds the ticker, e.g. "NYSE:ABC"
        ticker = chunks.css('a::text').extract_first()
        if ":" in ticker:
            ticker = ticker.split(':')[1]

        # matches "Name - Title" participant lines,
        # e.g. "John Smith - Chief Executive Officer"
        name = re.compile('([A-Za-z -]* - [A-Za-z ,&-]*)')
        execs = []
        analysts = []

        nextLine = chunks[i].css('p::text').extract_first()
        while re.match(name, nextLine) is not None:
            execs.append(nextLine)
            i += 1
            nextLine = chunks[i].css('p::text').extract_first()
        print("DONE EXECS")
        print(i)
        print("Next line: " + nextLine)

        # skip the "Analysts" section header before collecting analyst names
        i += 1
        nextLine = chunks[i].css('p::text').extract_first()
        while re.match(name, nextLine) is not None:
            analysts.append(nextLine)
            i += 1
            nextLine = chunks[i].css('p::text').extract_first()
        print("DONE ANALYSTS")

        print(execs)
        print("-----------")
        print(analysts)
        print("^^^^^^^^^")

        #### PLACEHOLDER
        # dump every paragraph (text and bold speaker headings) while the
        # real item yield below is still being worked out
        i = 0
        while i < len(chunks):
            print(i, ":", chunks[i].css('p::text').extract_first())
            print(i, ":", chunks[i].css('strong::text').extract_first())
            i += 1

        #yield {
        #    'company': firstline[0].split(" (", 1)[0],
        #    'stockmarket': firstline[0].split(" (", 1)[1],
        #    'ticker': ticker,
        #    'title': chunks[1].css('p::text').extract_first(),
        #    'date': chunks[2].css('p::text').extract_first()
        #}
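To try the spider locally (a minimal sketch, assuming Scrapy is installed; the output filename is just an example), it can be run standalone from the repo root:

    scrapy runspider seeking-alpha-scraper/transcripts_spider.py -o transcripts.json

With the yield block still commented out, nothing lands in the output file yet; the debug prints go to stdout while the participant parsing is being worked out.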