1.0 fuck yes

Ben Goldsworthy 2016-12-26 23:25:34 +00:00
parent f0998ecd98
commit 57800c7114
17 changed files with 761 additions and 73 deletions

@@ -0,0 +1,126 @@
#
# Title: Scraping Alpha
# Version: 1.0
# Author: Ben Goldsworthy <b.goldsworthy@lancaster.ac.uk>
#
# This file is a part of Scraping Alpha, a series of scripts to scrape
# earnings call transcripts from seekingalpha.com and present them as useful
# SQL.
#
# This file takes the `transcripts.json` file output of `transcript_spider.py`
# and converts it into SQL.
#
# This file should be located in the same directory as `transcripts.json`, and
# is run via 'python JSONtoSQL.py > [FILE].sql', where '[FILE]' is the name of
# the output file.
#
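# For reference, each entry in `transcripts.json` is expected to look roughly
# like the following (illustrative values only; the keys are the ones produced
# by `transcript_spider.py`, and this script reads `title`, `company`, `exec`,
# `analysts` and `transcript`):
#
#   {"entry": {"title": "Example Corp Q3 2016 Results Earnings Call",
#              "date": "November 1, 2016 10:00 AM ET",
#              "company": "Example Corp",
#              "exchange": "NYSE",
#              "ticker": "EX",
#              "exec": [["Jane Doe", "Chief Executive Officer", "Example Corp"]],
#              "analysts": [["John Roe", "Example Bank"]],
#              "transcript": "<h1>Operator</h1><p>Good day...</p>"}}
#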
import json
import sys
import codecs
sys.stdout=codecs.getwriter('utf-8')(sys.stdout)
json_data=open('transcripts.json').read()
data = json.loads(json_data)
executives = []
analysts = []
# For each transcript, creates new, separate arrays of executives and analysts
# for their own database tables, replacing their tuples in the transcript with
# their database keys.
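# For example, if the same ["Jane Doe", "CEO", "Example Corp"] tuple (an
# illustrative value) appears in two transcripts, it is appended to
# `executives` only once, and both transcripts' `exec` lists end up holding
# its 1-based key instead of the tuple itself.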
for entry in data:
    indexExec = len(executives)+1
    indexAnal = len(analysts)+1
    newExecs = []
    for executive in entry['entry']['exec']:
        if executive not in executives:
            executives.append(executive)
            newExecs.append(indexExec)
            indexExec += 1
        else:
            newExecs.append(executives.index(executive) + 1)
    entry['entry']['exec'] = newExecs
    newAnals = []
    for analyst in entry['entry']['analysts']:
        if analyst not in analysts:
            analysts.append(analyst)
            newAnals.append(indexAnal)
            indexAnal += 1
        else:
            newAnals.append(analysts.index(analyst) + 1)
    entry['entry']['analysts'] = newAnals
# Outputs the SQL file that creates the various tables and populates them with
# INSERT statements.
print "CREATE TABLE IF NOT EXISTS `execs`"
print "("
print "\t`id` INT NOT NULL UNIQUE AUTO_INCREMENT,"
print "\t`name` VARCHAR(255),"
print "\t`position` VARCHAR(255),"
print "\t`company` VARCHAR(255),"
print "\tPRIMARY KEY(`id`)"
print ");\n"
print "INSERT INTO `execs` (`name`, `position`, `company`) VALUES"
print "\t(0,0,0)",
for executive in executives:
    print ","
    print "\t(\""+executive[0]+"\",\""+executive[1]+"\",\""+executive[2]+"\")",
print ";\n"
print "CREATE TABLE IF NOT EXISTS `analysts`"
print "("
print "\t`id` INT NOT NULL UNIQUE AUTO_INCREMENT,"
print "\t`name` VARCHAR(255),"
print "\t`company` VARCHAR(255),"
print "\tPRIMARY KEY(`id`)"
print ");\n"
print "INSERT INTO `analysts` (`name`, `company`) VALUES"
print "\t(0,0)",
for analyst in analysts:
    print ","
    print "\t(\""+analyst[0]+"\",\""+analyst[1]+"\")",
print ";\n"
print "CREATE TABLE IF NOT EXISTS `transcripts`"
print "("
print "\t`id` INT NOT NULL UNIQUE AUTO_INCREMENT,"
print "\t`title` VARCHAR(255),"
print "\t`company` VARCHAR(255),"
print "\t`execs` VARCHAR(255),"
print "\t`analysts` VARCHAR(255),"
print "\t`transcript` TEXT,"
print "\tPRIMARY KEY(`id`)"
print ");\n"
print "INSERT INTO `transcripts` (`title`, `company`, `execs`, `analysts`, `transcript`) VALUES"
print "\t(0,0,0,0,0)",
for entry in data:
    tran = entry['entry']
    print ","
    print "\t(\""+tran['title']+"\",\""+tran['company']+"\",\""+(';'.join(str(x) for x in tran['exec']))+"\",\""+(';'.join(str(x) for x in tran['analysts']))+"\",\""+tran['transcript']+"\")",
print ";\n"
print "CREATE TABLE IF NOT EXISTS `execs_to_transcripts`"
print "("
print "\t`exec_id` INT NOT NULL,"
print "\t`transcript_id` INT NOT NULL,"
print "\tPRIMARY KEY(`exec_id`, `transcript_id`),"
print "\tFOREIGN KEY (`exec_id`) REFERENCES `execs`(`id`),"
print "\tFOREIGN KEY (`transcript_id`) REFERENCES `transcripts`(`id`)"
print ");\n"
print "CREATE TABLE IF NOT EXISTS `analysts_to_transcripts`"
print "("
print "\t`analyst_id` INT NOT NULL,"
print "\t`transcript_id` INT NOT NULL,"
print "\tPRIMARY KEY(`analyst_id`, `transcript_id`),"
print "\tFOREIGN KEY (`analyst_id`) REFERENCES `analysts`(`id`),"
print "\tFOREIGN KEY (`transcript_id`) REFERENCES `transcripts`(`id`)"
print ");"

@@ -0,0 +1,14 @@
# -*- coding: utf-8 -*-
# Define here the models for your scraped items
#
# See documentation in:
# http://doc.scrapy.org/en/latest/topics/items.html
import scrapy
class ScrapingAlphaItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    pass

@@ -0,0 +1,56 @@
# -*- coding: utf-8 -*-
# Define here the models for your spider middleware
#
# See documentation in:
# http://doc.scrapy.org/en/latest/topics/spider-middleware.html
from scrapy import signals
class ScrapingAlphaSpiderMiddleware(object):
    # Not all methods need to be defined. If a method is not defined,
    # scrapy acts as if the spider middleware does not modify the
    # passed objects.

    @classmethod
    def from_crawler(cls, crawler):
        # This method is used by Scrapy to create your spiders.
        s = cls()
        crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
        return s

    def process_spider_input(self, response, spider):
        # Called for each response that goes through the spider
        # middleware and into the spider.
        # Should return None or raise an exception.
        return None

    def process_spider_output(self, response, result, spider):
        # Called with the results returned from the Spider, after
        # it has processed the response.
        # Must return an iterable of Request, dict or Item objects.
        for i in result:
            yield i

    def process_spider_exception(self, response, exception, spider):
        # Called when a spider or process_spider_input() method
        # (from other spider middleware) raises an exception.
        # Should return either None or an iterable of Response, dict
        # or Item objects.
        pass

    def process_start_requests(self, start_requests, spider):
        # Called with the start requests of the spider, and works
        # similarly to the process_spider_output() method, except
        # that it doesn't have a response associated.
        # Must return only requests (not items).
        for r in start_requests:
            yield r

    def spider_opened(self, spider):
        spider.logger.info('Spider opened: %s' % spider.name)

@@ -0,0 +1,11 @@
# -*- coding: utf-8 -*-
# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html
class ScrapingAlphaPipeline(object):
    def process_item(self, item, spider):
        return item

@@ -0,0 +1,91 @@
# -*- coding: utf-8 -*-
# Scrapy settings for Scraping_Alpha project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
# http://doc.scrapy.org/en/latest/topics/settings.html
# http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
# http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html
BOT_NAME = 'Scraping_Alpha'
SPIDER_MODULES = ['Scraping_Alpha.spiders']
NEWSPIDER_MODULE = 'Scraping_Alpha.spiders'
USER_AGENT = "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/27.0.1453.93 Safari/537.36"
# Crawl responsibly by identifying yourself (and your website) on the user-agent
#USER_AGENT = 'Scraping_Alpha (+http://www.yourdomain.com)'
# Obey robots.txt rules
ROBOTSTXT_OBEY = True
# Configure maximum concurrent requests performed by Scrapy (default: 16)
#CONCURRENT_REQUESTS = 32
# Configure a delay for requests for the same website (default: 0)
# See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
#DOWNLOAD_DELAY = 3
# The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16
# Disable cookies (enabled by default)
#COOKIES_ENABLED = False
# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False
# Override the default request headers:
#DEFAULT_REQUEST_HEADERS = {
# 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
# 'Accept-Language': 'en',
#}
# Enable or disable spider middlewares
# See http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html
#SPIDER_MIDDLEWARES = {
# 'Scraping_Alpha.middlewares.ScrapingAlphaSpiderMiddleware': 543,
#}
# Enable or disable downloader middlewares
# See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
#DOWNLOADER_MIDDLEWARES = {
# 'Scraping_Alpha.middlewares.MyCustomDownloaderMiddleware': 543,
#}
# Enable or disable extensions
# See http://scrapy.readthedocs.org/en/latest/topics/extensions.html
#EXTENSIONS = {
# 'scrapy.extensions.telnet.TelnetConsole': None,
#}
# Configure item pipelines
# See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html
#ITEM_PIPELINES = {
# 'Scraping_Alpha.pipelines.SomePipeline': 300,
#}
# Enable and configure the AutoThrottle extension (disabled by default)
# See http://doc.scrapy.org/en/latest/topics/autothrottle.html
#AUTOTHROTTLE_ENABLED = True
# The initial download delay
#AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
#AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server
#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
#AUTOTHROTTLE_DEBUG = False
# Enable and configure HTTP caching (disabled by default)
# See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
#HTTPCACHE_ENABLED = True
#HTTPCACHE_EXPIRATION_SECS = 0
#HTTPCACHE_DIR = 'httpcache'
#HTTPCACHE_IGNORE_HTTP_CODES = []
#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'

@@ -0,0 +1,4 @@
# This package will contain the spiders of your Scrapy project
#
# Please refer to the documentation for information on how to create and manage
# your spiders.

@@ -0,0 +1,171 @@
#
# Title: Scraping Alpha
# Version: 1.0
# Author: Ben Goldsworthy <b.goldsworthy@lancaster.ac.uk>
#
# This file is a part of Scraping Alpha, a series of scripts to scrape
# earnings call transcripts from seekingalpha.com and present them as useful
# SQL.
#
# This file is the webspider that Scrapy uses to retrieve the information from
# the website. Left unattended, it will scrape all 4,000+ pages of results.
#
# To interrupt this behaviour and still be able to proceed with the other
# steps, cancel the script with CTRL+Z. This will likely leave an unfinished
# JSON item at the end of the output file. To clear this up, open the file
# in vim and type the following keys:
# 'G', 'V', 'd', '$', 'i', 'BACKSPACE', 'ENTER', ']', 'ESC', ':wq', 'ENTER'
# This will truncate the file at the last complete record and seal it off.
#
# For installation instructions for Scrapy, visit
# <doc.scrapy.org/en/latest/intro/install.html>. This file should be in the
# `spiders` directory of the project, and is run via 'scrapy crawl transcripts
# -o transcripts.json' at the command line (the output file will be placed
# in the directory the Terminal is currently in).
#
# Some of the <Exec, Position> tuples are separated by an em- rather than an
# en-dash, which isn't featured in the ASCII charset, hence the below line:
#-*- coding: utf-8 -*-
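#
# For reference, the parsing logic below assumes each transcript's `div#a-body`
# is a flat run of `p.p1` elements shaped roughly like this (an illustrative
# sketch, not a real transcript):
#
#   Example Corp (NYSE:EX)             <- preamble: company (exchange:ticker)
#   Q3 2016 Results Earnings Call      <- title
#   November 1, 2016 10:00 AM ET       <- date
#   Executives                         <- heading (<strong> text)
#   Jane Doe - Chief Executive Officer
#   Analysts
#   John Roe - Example Bank
#   Operator                           <- in the body, headings mark speakers
#   Good day, ladies and gentlemen...
#
# Note that the `enum` module imported below is not in the Python 2 standard
# library; it needs a backport such as `enum34`.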
import scrapy
# This enum lists the stages of each transcript.
from enum import Enum
Stage = Enum('Stage', 'preamble execs analysts body')
# Some transcript preambles are concatenated on a single line. This list is used
# to separate the title and date sections of the string.
months = ["Jan", "Feb", "Mar", "Apr", "May", "Jun", "Jul", "Aug", "Sep", "Oct", "Nov", "Dec"]
transcripts = {}
class TranscriptSpider(scrapy.Spider):
    name = 'transcripts'
    start_urls = ['http://seekingalpha.com/earnings/earnings-call-transcripts/1']

    def parse(self, response):
        # Follows each transcript page's link from the given index page.
        for href in response.css('.dashboard-article-link::attr(href)').extract():
            yield scrapy.Request(response.urljoin(href), callback=self.parse_transcript)
        # Follows the pagination links at the bottom of the given index page.
        next_page = response.css('li.next a::attr(href)').extract_first()
        if next_page is not None:
            next_page = response.urljoin(next_page)
            yield scrapy.Request(next_page, callback=self.parse)
    def parse_transcript(self, response):
        i = 0
        transcript = {}
        details = {}
        execs = []
        analysts = []
        script = []
        mode = 1
        # As the pages are represented by a series of `<p>` elements, all with
        # the same class `.p1` and no unique identifiers, we have to do this the
        # old-fashioned way - breaking it into chunks and iterating over them.
        body = response.css('div#a-body p.p1')
        chunks = body.css('p.p1')
        while i < len(chunks):
            # If the current line is a heading and we're not currently going
            # through the transcript body (where headings represent speakers),
            # change the current section flag to the next section.
            if (len(chunks[i].css('strong::text').extract()) == 0) or (mode == 4):
                currStage = Stage(mode)
                # If we're on the preamble stage, each bit of data is extracted
                # separately as they all have their own key in the JSON.
                if currStage == Stage['preamble']:
                    # If we're on the first line of the preamble, that's the
                    # company name, stock exchange and ticker acronym (or should
                    # be - see below).
                    if i == 0:
                        # Checks to see if the second line is a heading. If not,
                        # everything is fine.
                        if len(chunks[1].css('strong::text').extract()) == 0:
                            details['company'] = chunks[i].css('p::text').extract_first()
                            if " (" in details['company']:
                                details['company'] = details['company'].split(' (')[0]
                            # If a specific stock exchange is not listed, it
                            # defaults to NYSE.
                            details['exchange'] = "NYSE"
                            details['ticker'] = chunks.css('a::text').extract_first()
                            if ":" in details['ticker']:
                                ticker = details['ticker'].split(':')
                                details['exchange'] = ticker[0]
                                details['ticker'] = ticker[1]
                        # However, if it is, that means this line contains the
                        # full, concatenated preamble, so everything must be
                        # extracted here.
                        else:
                            details['company'] = chunks[i].css('p::text').extract_first()
                            if " (" in details['company']:
                                details['company'] = details['company'].split(' (')[0]
                            # If a specific stock exchange is not listed, default to NYSE.
                            details['exchange'] = "NYSE"
                            details['ticker'] = chunks.css('a::text').extract_first()
                            if ":" in details['ticker']:
                                ticker = details['ticker'].split(':')
                                details['exchange'] = ticker[0]
                                details['ticker'] = ticker[1]
                            titleAndDate = chunks[i].css('p::text').extract()[1]
                            for date in months:
                                if date in titleAndDate:
                                    splits = titleAndDate.split(date)
                                    details['title'] = splits[0]
                                    details['date'] = date + splits[1]
                    # Otherwise, we're onto the title line.
                    elif i == 1:
                        title = chunks[i].css('p::text').extract_first()
                        # This should never be the case, but just to be careful
                        # I'm leaving it in.
                        if len(title) <= 0:
                            title = "NO TITLE"
                        details['title'] = title
                    # Or the date line.
                    elif i == 2:
                        details['date'] = chunks[i].css('p::text').extract_first()
                # If we're onto the 'Executives' section, we create a list of
                # all of their names, positions and company name (from the
                # preamble).
                elif currStage == Stage['execs']:
                    anExec = chunks[i].css('p::text').extract_first().split(" - ")
                    # This covers if the execs are separated with an em- rather
                    # than an en-dash (see above).
                    if len(anExec) <= 1:
                        anExec = chunks[i].css('p::text').extract_first().split(" — ")
                    name = anExec[0]
                    if len(anExec) > 1:
                        position = anExec[1]
                    # Again, this should never be the case, as an Exec-less
                    # company would find it hard to get much done.
                    else:
                        position = ""
                    execs.append((name,position,details['company']))
                # This does the same, but with the analysts (which never seem
                # to be separated by em-dashes for some reason).
                elif currStage == Stage['analysts']:
                    name = chunks[i].css('p::text').extract_first().split(" - ")[0]
                    company = chunks[i].css('p::text').extract_first().split(" - ")[1]
                    analysts.append((name,company))
                # This strips the transcript body of everything except simple
                # HTML, and stores that.
                elif currStage == Stage['body']:
                    line = chunks[i].css('p::text').extract_first()
                    html = "p>"
                    if line is None:
                        line = chunks[i].css('strong::text').extract_first()
                        html = "h1>"
                    script.append("<"+html+line+"</"+html)
            else:
                mode += 1
            i += 1
        # Adds the various arrays to the dictionary for the transcript.
        details['exec'] = execs
        details['analysts'] = analysts
        details['transcript'] = ''.join(script)
        # Wraps the transcript details up and yields them for the output.
        transcript["entry"] = details
        yield transcript

@@ -0,0 +1,104 @@
#
# Title: Scraping Alpha
# Version: 1.0
# Author: Ben Goldsworthy <b.goldsworthy@lancaster.ac.uk>
#
# This file is a part of Scraping Alpha, a series of scripts to scrape
# earnings call transcripts from seekingalpha.com and present them as useful
# SQL.
#
# This file takes the `transcripts.sql` file exported from the database
# created using the output file of `JSONtoSQL.py` after the following query:
# SELECT `id`, `execs`, `analysts` FROM `transcripts`
# From this it creates two files, `execs.sql` and `analysts.sql`, for creating
# the linking tables in the database.
#
# This file should be located in the same directory as `transcripts.sql`, and
# is run via 'python execsAndAnalysts.py'.
#
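# For reference, the export this script expects looks something like the
# following (cf. the `transcripts.sql` dump elsewhere in this commit):
#
#   INSERT INTO `transcripts` (`id`, `execs`, `analysts`) VALUES
#   (1, '0', '0'),
#   (2, '1;2;3;4', '1'),
#   (3, '5;6;7', '2;3;4;5;6;7;8;9;10'),
#   ...
#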
import sys
import codecs
import os
from shutil import copyfile
import fileinput
sys.stdout=codecs.getwriter('utf-8')(sys.stdout)
start = 0
# Creates a temporary copy in case something goes Pete Tong.
copyfile("transcripts.sql", "transcripts.sql.tmp")
# Trims everything from the export except for the INSERT statement.
for line in fileinput.FileInput("transcripts.sql.tmp",inplace=1):
if start == 0:
if "INSERT INTO `transcripts`" in line:
start = 1
print "INSERT INTO `execs_to_transcripts` (`exec_id`, `transcript_id`) VALUES"
else:
if line == "\n":
start = 0
else:
print "\t"+line,
# Copies the produced file to create both the output files, then deletes the
# temporary file.
copyfile("transcripts.sql.tmp", "execs.sql")
copyfile("transcripts.sql.tmp", "analysts.sql")
os.remove("transcripts.sql.tmp")
# Converts each line "(x, '0;...;n')" in the file to n separate INSERTs, one
# for each executive.
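# For example, the line "\t(2, '1;2;3;4', '1'),\n" is rewritten as:
#
#   (2, 1),
#   (2, 2),
#   (2, 3),
#   (2, 4),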
start = 0
nL = ""
for line in fileinput.FileInput("execs.sql",inplace=1):
if start == 0:
start = 1
else:
bits = line.split(', ')
tID = bits[0].strip('\t').strip('(')
execs = bits[1].split(';')
newLines = ""
for execID in execs:
newLines = newLines + nL + "\t("+tID+", "+execID.strip('\'')+"),"
line = line.replace(line, newLines)
nL = "\n"
print line,
# Does the same for the analysts.
start = 0
nL = ""
for line in fileinput.FileInput("analysts.sql",inplace=1):
if start == 0:
start = 1
line = line.replace(line, "INSERT INTO `analysts_to_transcripts` (`analyst_id`, `transcript_id`) VALUES\n")
else:
bits = line.split(', ')
tID = bits[0].strip('\t').strip('(')
# As it is possible for there to be no analysts in a call, this ignores
# blank results.
if "''" not in bits[2]:
analysts = bits[2].split(';')
newLines = ""
for analystID in analysts:
# This stops the final transcript from getting an additional,
# `analyst_id`-less INSERT
if analystID != '\n':
newLines = newLines + nL + "\t("+tID+", "+analystID.strip('\'').strip('\'),\n')+"),"
line = line.replace(line, newLines)
nL = "\n"
else:
line = ""
print line,
# Replace the final comma at the end of each file with a semicolon, to make it
# valid SQL
with open("execs.sql", 'rb+') as filehandle:
filehandle.seek(-1, os.SEEK_END)
filehandle.truncate()
filehandle.write(";")
with open("analysts.sql", 'rb+') as filehandle:
filehandle.seek(-1, os.SEEK_END)
filehandle.truncate()
filehandle.write(";")

@@ -0,0 +1,11 @@
# Automatically created by: scrapy startproject
#
# For more information about the [deploy] section see:
# https://scrapyd.readthedocs.org/en/latest/deploy.html
[settings]
default = Scraping_Alpha.settings
[deploy]
#url = http://localhost:6800/
project = Scraping_Alpha

File diff suppressed because one or more lines are too long

@@ -0,0 +1,110 @@
-- phpMyAdmin SQL Dump
-- version 4.0.10.14
-- http://www.phpmyadmin.net
--
-- Host: localhost:3306
-- Generation Time: Dec 26, 2016 at 02:39 PM
-- Server version: 5.5.52-cll-lve
-- PHP Version: 5.6.20
SET SQL_MODE = "NO_AUTO_VALUE_ON_ZERO";
SET time_zone = "+00:00";
/*!40101 SET @OLD_CHARACTER_SET_CLIENT=@@CHARACTER_SET_CLIENT */;
/*!40101 SET @OLD_CHARACTER_SET_RESULTS=@@CHARACTER_SET_RESULTS */;
/*!40101 SET @OLD_COLLATION_CONNECTION=@@COLLATION_CONNECTION */;
/*!40101 SET NAMES utf8 */;
--
-- Database: `bengoldsworthy`
--
-- --------------------------------------------------------
--
-- Table structure for table `transcripts`
--
CREATE TABLE IF NOT EXISTS `transcripts` (
`id` int(11) NOT NULL AUTO_INCREMENT,
`title` varchar(255) DEFAULT NULL,
`company` varchar(255) DEFAULT NULL,
`execs` varchar(255) DEFAULT NULL,
`analysts` varchar(255) DEFAULT NULL,
`transcript` text,
PRIMARY KEY (`id`),
UNIQUE KEY `id` (`id`)
) ENGINE=MyISAM DEFAULT CHARSET=latin1 AUTO_INCREMENT=63 ;
--
-- Dumping data for table `transcripts`
--
INSERT INTO `transcripts` (`id`, `execs`, `analysts`) VALUES
(1, '0', '0'),
(2, '1;2;3;4', '1'),
(3, '5;6;7', '2;3;4;5;6;7;8;9;10'),
(4, '8;9;10', '11;12;13;14;15;16;17;18;19;20;21;22;23;24;25'),
(5, '11;12;13;14', '26;27;28'),
(6, '15;16;17;18;19', '29;30;31;32;33;34;35;36;37;38;39'),
(7, '20;21;22;23', '40;41;42;43;44;45;46;47;48'),
(8, '24;25;26', '49;50;51;52;53;54;55;56;57;58;59;60;61;62;63;64;65;66;67;68'),
(9, '27;28;29;30', '69;70;71'),
(10, '31;32', '72;73;74;75;76;77;78;79;80;81'),
(11, '33;34;35', '82;83;84;85;86;87;88;89;90;91;92;93;94;95'),
(12, '36;37;38;39', '96;97;98;81;99'),
(13, '40;41;42', '100;101;102;103;104;105;106'),
(14, '43;44;45;46;47;48;49;50;51', '107;108;109;110;111;112;113;114;115;116;117'),
(15, '52;53;54', '13;118;119;120;23;121'),
(16, '55;56;57', '122;39;123;124;125;126'),
(17, '58;59;60;61', '127;128;129;130;131'),
(18, '62;63', '132;133'),
(19, '64;65;66', '134;135;136;137;138;139;140;141;142'),
(20, '67;68', '141;143;144;145;146;142;147;148;149;95;150;151;152;153'),
(21, '69;70;71', '154;155;156;157;158;159;160;161;162;163;164;165;166;167;168;169;170'),
(22, '72;73;74', '171;172;173;174'),
(23, '75;76;77;78', '175;176;177;178;179;180;181'),
(24, '79;80;81', '182;183'),
(25, '82;83;84', '184;185;186;187;188;189;190;191;192;193;194;195;196;197;198'),
(26, '85;86;87;88', '199;200;201;202;203;204;205;206;207;208;209;210;211;212;213'),
(27, '89;90;91;92;93', '214;215;216;44;217;218;41;219;45;220'),
(28, '94;95;96', '71;70'),
(29, '97;98;99', ''),
(30, '100;101', '221;222;131'),
(31, '102;103;104;105', '223;224;225'),
(32, '106;107', '226;85;227;228;229;230;231;232;112;233;234;235'),
(33, '108;109', '236;237;238'),
(34, '110;111', '239;240;241;242;243;244'),
(35, '112;113;114', '245;246'),
(36, '115;116;117;118', '247;248'),
(37, '119;120;121', '249;250'),
(38, '122;123', '251;252;253'),
(39, '124;125;126', '254;255;256;257;258;259;260;261;262'),
(40, '127;128;129', '263;264;265'),
(41, '130;131;132;133', '266;267'),
(42, '134;135;136;137;138;139', '268;269;270;271;272'),
(43, '140;141;142', '190;273;274;275;276;277;278;279;280;186'),
(44, '143;144;145', '281;164;282;154;283;284;285;286;287;288;289;290;291'),
(45, '146;147;148', '292;293;294'),
(46, '149;150;151', '295'),
(47, '152;153;154;155', '165;296;166;297;298;299;160;300'),
(48, '156;157;158', '301;302;303;304;305;306;307;308'),
(49, '159;160;161', '309;310;311'),
(50, '162;163', '312;313'),
(51, '164;165', '314'),
(52, '166;167;168', '315;316;317;318'),
(53, '169;170;171', '319;320'),
(54, '172;173;174;175;176;177', '321;322;323;324;325;326;327;328'),
(55, '178;179;180', '329;330;331;332;333'),
(56, '181;182', '334;335'),
(57, '183;184;185', '336;337;338;339;340;341;342;343'),
(58, '186;187;188;189;190;191;192;193;194;195', '344;345;238;346;347;348;349;350;351;352;353;237;354'),
(59, '196;197;198', '355;356;357;358;359'),
(60, '199;200;201', '360;361'),
(61, '202;203', '362;363'),
(62, '204;205;206', '364;365');
/*!40101 SET CHARACTER_SET_CLIENT=@OLD_CHARACTER_SET_CLIENT */;
/*!40101 SET CHARACTER_SET_RESULTS=@OLD_CHARACTER_SET_RESULTS */;
/*!40101 SET COLLATION_CONNECTION=@OLD_COLLATION_CONNECTION */;

@@ -1,73 +0,0 @@
import scrapy
import re
class TranscriptSpider(scrapy.Spider):
    name = 'transcripts'
    start_urls = ['http://seekingalpha.com/earnings/earnings-call-transcripts/1']

    def cleanhtml(raw_html):
        cleanr = re.compile('<.*?>')
        cleantext = re.sub(cleanr, '', raw_html)
        return cleantext

    def parse(self, response):
        # follow links to transcript pages
        for href in response.css('.dashboard-article-link::attr(href)').extract():
            yield scrapy.Request(response.urljoin(href),
                                 callback=self.parse_transcript)
        # follow pagination links
        #next_page = response.css('li.next a::attr(href)').extract_first()
        #if next_page is not None:
        #    next_page = response.urljoin(next_page)
        #    yield scrapy.Request(next_page, callback=self.parse)

    def parse_transcript(self, response):
        i = 4

        def extract_with_css(query):
            return response.css(query).extract_first().strip()

        body = response.css('div#a-body p.p1')
        chunks = body.css('p.p1')
        firstline = chunks[0].css('p::text').extract()
        ticker = chunks.css('a::text').extract_first()
        if ":" in ticker:
            ticker = ticker.split(':')[1]
        name = re.compile('([A-z -]* - [A-z ,&-]*)')
        execs = []
        analysts = []
        nextLine = chunks[i].css('p::text').extract_first()
        while re.match(name, nextLine) is not None:
            execs.append(nextLine)
            i += i
            nextLine = chunks[i].css('p::text').extract_first()
        print "DONE EXECS"
        print i
        print "Next line: "+nextLine
        while re.match(name, nextLine) is not None:
            analysts.append(nextLine)
            i += i
            nextLine = chunks[i].css('p::text').extract_first()
        print "DONE ANALYSTS"
        print execs
        print "-----------"
        print analysts
        print "^^^^^^^^^"
        #### PLACEHOLDER
        i = 0
        while True:
            print i ,": " , chunks[i].css('p::text').extract_first()
            print i ,": " , chunks[i].css('strong::text').extract_first()
            i += 1
        #yield {
        #    'company': firstline[0].split(" (", 1)[0],
        #    'stockmarket': firstline[0].split(" (", 1)[1],
        #    'ticker': ticker,
        #    'title': chunks[1].css('p::text').extract_first(),
        #    'date': chunks[2].css('p::text').extract_first()
        #}