This repository has been archived on 2022-08-01. You can view files and clone it, but cannot push or open issues or pull requests.
Scraping-Alpha/seeking-alpha-scraper/transcripts_spider.py

74 lines
2.2 KiB
Python

import scrapy
import re
class TranscriptSpider(scrapy.Spider):
    """Crawl the Seeking Alpha earnings-call transcript listing.

    ``parse`` follows every transcript link found on the listing page and
    ``parse_transcript`` pulls the participant lines out of each transcript.
    Item emission is still a work in progress (see the TODO at the end of
    ``parse_transcript``), so the extracted data is currently only logged.
    """

    name = 'transcripts'
    start_urls = ['http://seekingalpha.com/earnings/earnings-call-transcripts/1']

    # Matches participant lines of the form "<name> - <role>".  The letter
    # range is spelled ``A-Za-z`` because ``A-z`` also admits the punctuation
    # characters that sit between "Z" and "a" in ASCII.
    _PARTICIPANT = re.compile(r'([A-Za-z -]* - [A-Za-z ,&-]*)')

    # Index of the first paragraph inspected for participants.
    # NOTE(review): presumably paragraphs 0-3 are the company line, title,
    # date and a section heading -- confirm against a live page.
    _FIRST_PARTICIPANT_INDEX = 4

    @staticmethod
    def cleanhtml(raw_html):
        """Return ``raw_html`` with every ``<...>`` tag removed."""
        return re.sub(r'<.*?>', '', raw_html)

    def parse(self, response):
        """Yield one request per transcript link on the listing page."""
        # follow links to transcript pages
        for href in response.css('.dashboard-article-link::attr(href)').extract():
            yield scrapy.Request(response.urljoin(href),
                                 callback=self.parse_transcript)
        # follow pagination links
        # next_page = response.css('li.next a::attr(href)').extract_first()
        # if next_page is not None:
        #     next_page = response.urljoin(next_page)
        #     yield scrapy.Request(next_page, callback=self.parse)

    @staticmethod
    def _text_at(chunks, index):
        """Return the first text node of ``chunks[index]``.

        Returns ``None`` when ``index`` is past the end of ``chunks`` or the
        paragraph has no direct text node.
        """
        if index >= len(chunks):
            return None
        return chunks[index].css('p::text').extract_first()

    def _collect_participants(self, chunks, start):
        """Collect consecutive participant lines beginning at ``start``.

        Returns a ``(lines, next_index)`` tuple where ``next_index`` is the
        index of the first paragraph that did not match (or ``len(chunks)``
        if the paragraphs ran out).
        """
        lines = []
        index = start
        while True:
            line = self._text_at(chunks, index)
            if line is None or self._PARTICIPANT.match(line) is None:
                return lines, index
            lines.append(line)
            index += 1

    def parse_transcript(self, response):
        """Extract the executives and analysts listed in a transcript page.

        The results are written to the spider's debug log; nothing is
        yielded yet.
        """
        body = response.css('div#a-body p.p1')
        chunks = body.css('p.p1')
        if not chunks:
            self.logger.warning('No transcript paragraphs found in %s',
                                response.url)
            return

        firstline = chunks[0].css('p::text').extract()
        ticker = chunks.css('a::text').extract_first()
        # The link text may carry a prefix before a colon; keep the part
        # after it.  A page without any link text leaves the ticker as None.
        if ticker is not None and ':' in ticker:
            ticker = ticker.split(':')[1]
        self.logger.debug('First line: %r, ticker: %r', firstline, ticker)

        execs, index = self._collect_participants(
            chunks, self._FIRST_PARTICIPANT_INDEX)
        self.logger.debug('DONE EXECS')
        self.logger.debug('%d', index)
        self.logger.debug('Next line: %s', self._text_at(chunks, index))

        # The paragraph that ended the executives run cannot itself be a
        # participant, so step over it before collecting the next group.
        # NOTE(review): presumably that paragraph is the section heading
        # between the two lists -- confirm against a live page.
        analysts, index = self._collect_participants(chunks, index + 1)
        self.logger.debug('DONE ANALYSTS')
        self.logger.debug('%s', execs)
        self.logger.debug('-----------')
        self.logger.debug('%s', analysts)
        self.logger.debug('^^^^^^^^^')

        #### PLACEHOLDER
        # Dump every paragraph (plain text and bold text) to help work out
        # the page layout; bounded by the number of paragraphs.
        for position, chunk in enumerate(chunks):
            self.logger.debug('%d : %s', position,
                              chunk.css('p::text').extract_first())
            self.logger.debug('%d : %s', position,
                              chunk.css('strong::text').extract_first())

        # TODO: emit the item once the layout is confirmed.
        # yield {
        #     'company': firstline[0].split(" (", 1)[0],
        #     'stockmarket': firstline[0].split(" (", 1)[1],
        #     'ticker': ticker,
        #     'title': chunks[1].css('p::text').extract_first(),
        #     'date': chunks[2].css('p::text').extract_first()
        # }