From 75e3dc5790d6ad0db29a1a1c3a387333df8b103b Mon Sep 17 00:00:00 2001 From: Ben Goldsworthy Date: Tue, 27 Dec 2016 00:45:46 +0000 Subject: [PATCH] Started on slides --- .../Scraping_Alpha/{README => README.md} | 4 + .../Scraping_Alpha/spiders/slides_spider.py | 79 ++++++++++++++++++ .../Scraping_Alpha/spiders/slides_spider.pyc | Bin 0 -> 2715 bytes .../spiders/transcript_spider.py | 9 +- .../spiders/transcript_spider.pyc | Bin 4460 -> 3742 bytes scraping-alpha/Scraping_Alpha/urls.json | 0 6 files changed, 86 insertions(+), 6 deletions(-) rename scraping-alpha/Scraping_Alpha/{README => README.md} (86%) create mode 100644 scraping-alpha/Scraping_Alpha/Scraping_Alpha/spiders/slides_spider.py create mode 100644 scraping-alpha/Scraping_Alpha/Scraping_Alpha/spiders/slides_spider.pyc create mode 100644 scraping-alpha/Scraping_Alpha/urls.json diff --git a/scraping-alpha/Scraping_Alpha/README b/scraping-alpha/Scraping_Alpha/README.md similarity index 86% rename from scraping-alpha/Scraping_Alpha/README rename to scraping-alpha/Scraping_Alpha/README.md index 31204e5..034a13f 100644 --- a/scraping-alpha/Scraping_Alpha/README +++ b/scraping-alpha/Scraping_Alpha/README.md @@ -79,3 +79,7 @@ It creates from this two files (`execs.sql` and `analysts.sql`). Import them into your DBMS to create two linking tables. The final instruction of `analysts.sql` then deletes the superfluous `execs` and `analysts` columns from the `transcripts` table (and for this reason, `execs.sql` must be imported first). + +### Future + +Harvesting the URLs of slide images shouldn't be too hard to implement - `slides_spider.py` should in theory to this, but the link to a transcript's slides is added to the page later via Javascript, which means at the moment it throws up a load of HTTP 200 status codes and nowt else. [Scrapy+Splash](https://github.com/scrapy-plugins/scrapy-splash) may be the solution, however. diff --git a/scraping-alpha/Scraping_Alpha/Scraping_Alpha/spiders/slides_spider.py b/scraping-alpha/Scraping_Alpha/Scraping_Alpha/spiders/slides_spider.py new file mode 100644 index 0000000..29c22e8 --- /dev/null +++ b/scraping-alpha/Scraping_Alpha/Scraping_Alpha/spiders/slides_spider.py @@ -0,0 +1,79 @@ +# +# Title: Scraping Alpha +# Version: 1.0 +# Author: Ben Goldsworthy +# +# This file is a part of Scraping Alpha, a series of scripts to scrape +# earnings call transcripts from seekingalpha.com and present them as useful +# SQL. +# +# This file is the webspider that Scrapy uses to retrieve slides. +# + +import scrapy +urls = [] +# A transcript record can be uniquely identified using it's company name + date. +uniqueID = "" +# Some transcript preambles are concatenated on a single line. This list is used +# To separate the title and date sections of the string. +months = ["Jan", "Feb", "Mar", "Apr", "May", "Jun", "Jul", "Aug", "Sep", "Oct", "Nov", "Dec"] + +class SlidesSpider(scrapy.Spider): + name = 'slides' + start_urls = ['http://seekingalpha.com/earnings/earnings-call-transcripts/1'] + + def parse(self, response): + # Follows each transcript page's link from the given index page. + for href in response.css('.dashboard-article-link::attr(href)').extract(): + yield scrapy.Request(response.urljoin(href), callback=self.parse_transcript) + + # Follows the pagination links at the bottom of given index page. + next_page = response.css('li.next a::attr(href)').extract_first() + if next_page is not None: + next_page = response.urljoin(next_page) + yield scrapy.Request(next_page, callback=self.parse) + + def parse_transcript(self, response): + slides = response.css('li#slides a::attr(href)').extract_first() + if slides is not None: + body = response.css('div#a-body p.p1') + chunks = body.css('p.p1') + i = 0 + while i < 3: + # If we're on the first line of the preamble, that's the + # company name, stock exchange and ticker acroynm (or should + # be - see below) + if i == 0: + # Checks to see if the second line is a heading. If not, + # everything is fine. + if len(chunks[1].css('strong::text').extract()) == 0: + uniqueID = chunks[i].css('p::text').extract_first() + if " (" in uniqueID: + uniqueID = uniqueID.split(' (')[0] + i = 2 + # However, if it is, that means this line contains the + # full, concatenated preamble, so everything must be + # extracted here + else: + uniqueID = chunks[i].css('p::text').extract_first() + if " (" in uniqueID: + uniqueID = uniqueID.split(' (')[0] + titleAndDate = chunks[i].css('p::text').extract[1] + for date in months: + if date in titleAndDate: + splits = titleAndDate.split(date) + uniqueID = uniqueID + ";" + date + splits[1] + i = 3 + # Otherwise, we're onto the date line. + elif i == 2: + uniqueID = uniqueID + ";" + chunks[i].css('p::text').extract_first() + i += 1 + + slides = response.urljoin(slides) + yield uniqueID + #yield scrapy.Request(sides, callback=self.parse_slides) + + def parse_slides(self, response): + urls = response.css('figure img::attr(src)').extract() + yield uniqueID + "\\\\" + ';'.join(urls) + diff --git a/scraping-alpha/Scraping_Alpha/Scraping_Alpha/spiders/slides_spider.pyc b/scraping-alpha/Scraping_Alpha/Scraping_Alpha/spiders/slides_spider.pyc new file mode 100644 index 0000000000000000000000000000000000000000..863f619c4e09d8050a6f6c3017922891f1a26cb9 GIT binary patch literal 2715 zcmd5;&2Ah;5U!s6vDaDK0bv_QL>SpdW2E&GA%SE9!jXj_B1RfZ4p@ms<0gtxrMx9e9Gg08hXbj@)?@9ss_o-Z+r>z|E|-tE#)Jy6flb+O7RLJJa|j`YNW$ zrw#rIh6zCud=+Ix2ShzkMTtiT9wjwO>XbAnX;Lyp$uuP`N@ggzK}nmESxV-p*QNug z&66fNuG0~0Z_+nJPl&!gfTf~6{}tSk{)Nx3g+#UnazDz2f2t3KKZr`<+lAu~qI0KL zIlcQ;=JuZr`IU@nDD9A#O@ zmQilvGA*p>-ibXNz=t+4*g5kP$k|6UR&*@2cmhFj4tE~EO&11M9a>`L@cHNS-&DN}5W zox%hvWrPw+cH=TC&Sg2!CzUo<OPw6o982#?YdCY;yZQLZf~ zSM*)|c-`piNL($oDMq=`qRkOl_KY14i>Rl4Oh|}1h1BhjhPqo;!$Ox;)!iAz)llcC zWL6DL7iXd9alaE8lNt+LcTwr6017&Si|*cJ8g4U8UYr`isOdUzg>Jf8oCmMt>SYYW zcqZ=+wN_hDZ8fh1KQmb4n7Bgb4mM~u$qu(LgCAfF8uJC<`yuii$U5*j%a$cUYCt`7BPq3KC+>}=Ql$mAi4+Umy%;P-_(*?N_W}E|$mH?(> z!5nhAw67!r_GMRE!cp3*O1+W}al5n*Q^p&H?|bwpxDDp>MFGF_fPXUNaq`o5>--dg zThG?f6^zer%6qQz6b<9^9UlvHjc-nyuMSEMb&CJeoM{D{%zIr z+G{r|*3qm>8#0ol*NO)&A<(-1{?Bip zlVTV)O(de_wQfj=!K+va-ej`EgwrOKb^mt)w*3DDA_ckYO7JneCIPio(`(k=@%{#s CHE5#% literal 0 HcmV?d00001 diff --git a/scraping-alpha/Scraping_Alpha/Scraping_Alpha/spiders/transcript_spider.py b/scraping-alpha/Scraping_Alpha/Scraping_Alpha/spiders/transcript_spider.py index ff59738..2767934 100644 --- a/scraping-alpha/Scraping_Alpha/Scraping_Alpha/spiders/transcript_spider.py +++ b/scraping-alpha/Scraping_Alpha/Scraping_Alpha/spiders/transcript_spider.py @@ -1,4 +1,5 @@ -# +#-*- coding: utf-8 -*- + # Title: Scraping Alpha # Version: 1.0 # Author: Ben Goldsworthy @@ -24,10 +25,6 @@ # in the directory the Terminal is currently in). # -# Some of the tuples are separate by an em- rather than an -# en-dash, which isn't featured in the ASCII charset, hence the below line: -#-*- coding: utf-8 -*- - import scrapy # This enum lists the stages of each transcript. from enum import Enum @@ -112,7 +109,7 @@ class TranscriptSpider(scrapy.Spider): if date in titleAndDate: splits = titleAndDate.split(date) details['title'] = splits[0] - details['date'] = dates + splits[1] + details['date'] = date + splits[1] # Otherwise, we're onto the title line. elif i == 1: title = chunks[i].css('p::text').extract_first() diff --git a/scraping-alpha/Scraping_Alpha/Scraping_Alpha/spiders/transcript_spider.pyc b/scraping-alpha/Scraping_Alpha/Scraping_Alpha/spiders/transcript_spider.pyc index 09eb89eef2c6fa3130157d2f2f0747dda6cc464d..78fa5df800e3d08e2d61431c8486544b46bdd9c6 100644 GIT binary patch literal 3742 zcmd56sBmCZ5I|GzBFJ$`s5{FiSy&_Di&rqY&c2JV%=7AWwUWf_eIg=mF7tJJ_#iyS{+K!{70@ zRaYQw4$ZB&-?bAUwynRf%|%d2s{QV6q-**}w@l5C{pi?OQ`_wY$JPbIn|>_)>v~uE z+kPtj%_Qq@`y;-7#`k7F%GUe)(r@cT`gdAZ`mNsk(r@aPtw8>r)Q?Ro4HLVagn>>^ z+{jwLhOfDX28QmyO0@8qqJ1R}bJ%G3172ieg1qfI+i9n{IT38k2F7h`ll4Z!Xx$0p zeLqUt{#vWoZD>D@F=0-K)s`Pct7o=1uD2WrV9D$}*3f){=I9j~C_0b`KLDXP0Gs!( z($FF6(19zG<-nlI3$ilHx6BF@TMK;C-tGBmu!?Aftw^s%Vcc0?_pMFq?Nskwvr}vd4Bhox9kYl* z6t2bk$kzO`Z1n?8(YNqxjJ4;pdvlVh3o^kH8cD|A1N4tilM;Z6M1=;o<2 zZTVD2b1zJ>h5fqKi?twA__HALywTB~sG4e%^kSohn=P=c87tgOfaN*{B#rq1RHNPN z>PFh{COYl+8=JjWzpG^G_ODLh|Hb?ttC#20w zi~H=*aB2*-3=9Pt6a;yAJRqS+M_=RsGplG&nC8IaIjZv7RTpBiJ?U3YCK!4>R&e@>f+pjmMdkr?B^uzm z7H9*nTSiqoc}Q2I0j>vDGWDt>R>n8f!M+l0Kq%W%7F$jP&OqR-u*^VR2>jAvehgUP ztXNc{0p3zYB941PwFvC%X&yZF|KB>SbV8jGOEVEzcXydTy;B)WrA;Sv`jD_Evznv?Q z7*+&;l~MdUiqc;kWG2kBt`!LoP8kVX5MdDfSe)?r4|9j#=SEmOmtpb8$Ha+ay(o6g zi6V2N2y})bpZ>$1rO_T%a!%Yaod2_c9Hrm5yvEL5pcrX-nLrFC=wOlt^E3c0z^?Ae z1y!d?pU;%O{TMq>l;$wt+(&!kl#4YuM}z$M;B&Mm>E+Cc)$ByL?U9}xE>duThD$U6 zfmAa^=~ri2vv(qkC^(*CR(8!K9~oSv!4fTF>vC`jJ!k-l507yeo)U*%=Eq9F@-m*b z!?%#=062K(X#gxQ)Op<3WjeqsHp#x{#miVks>F*^1Le|P1>Lsm&)~t_!o!GSh6@5S zizW!)U-4JDLa8Nd$#uhtLI{3xjZJ&;{`$H_Y6y8n(#N>WF`UAZY*lgAcQc zybIhs#q%sC^j=_+{S#>GhmrBFg7!EQk1GUJb|RzO3$(|XQ5IV5e%vvV#9RF|m4eiJ zmRC85;?J9LunBK^SH^opRpUK39{cevwx1P@{jSae-7}#Ld$D(!xp)bc&!}p!-RS>Q zCoDm=m+&>8p&^w&`#&*@dSNklU6tHQuH;@(msCYP?G{xP{eqfN=O$#vIh~nP7gSm5 zj2U&&DXFSEuL`ORY9&`u&th~%71g{d=kn^3TTxf>zl6SubHNSX6-b-G8AY{_E2=uz zcjD1zlbzda3eK8Mgs~rC=y6RgbEu$F+eFc4r1pO9ZtxE}!?(WVRMf2dyvjR& E0Fsgy_W%F@ literal 4460 zcmd5s0A88VB9E`3p8jH;ArHJT7x+8R^pMz zN$V-4e^fRO%BS1buKR}-X^f3sEJf**xBPlhF3IqrcOUJ$0-Pzfh z-^|X=xnC_8M<4m`wrTRWg!gOsnLnWk@Fc6jk0M>sXS!`Fe(Xo*#+u5}AUL-! z7~b$>dH+Tq$@@(|mG@qfz2EdFe8Y_IjbW6{4^QOX)QP;`YFl|94BnUbzHZx9$bXdj zv1zAaVw*`A=yV1tYyCQY=I3Z&Xa?Y+!{(g;Pax;8(LfF;G6CdrLgdM7R@qLw$jxjA zV;bmptW9?7b)$7Jj8FV1>G<32LBFp3G{%6L5w*4-MYT)I>pN}MieX_me~q81qOouT ze4yxwqOp>$LnDl!g;!h}gK*#!UU^_xv&Q!QUp>G&A{n#}s8ri|yr4@GMp{rZ!=r3| zf#>}XS{>VuY@WFy?Z>GtOi8QHEbBtjm=8hKJA=Nir^9}t)8VkbKWGp8I<}@B4g02^ zY6FYgotke%cVOGVIQhofQW$$pX9Qo~@*+eG_w_`Y@;$LQ~va`gkZDYjMU`c%(3P-MRqWGO`mHt>ZAo6484QYdPk^FQ&-2(b2J}3fCut z*k~DL3&dZna4SJ#cxuAPdmi?!Gd7%G$+@Y1=5DGbM3KbMGZ^C-VlSCjLE!`{;y-^i5iU18@;X%0>7`~7HKaIprP zG|Epm-lSu>d@k+yY_=m{`(9+n*D1J3<1HG2z(urx6MdOA%M;mc1&&+H%FxX7mceZr zZP6B%ZUxWb4H|*Oo5#={sl=gIkjk=Pc?F5@G{iNCv;uT0ICOcTZs5F@sSEXFGoE}N zlekE6_1yvG(%TBUHLILP50ExdH*%HJ=7xWKU2`>Q%4j$>LP5!E*&U4>fi%~YD)T^! z6^|Lc=g`P4WGt$?tO(_7jjJs^m#)V<%4PFYmn&R|PVdnVLfdKKg1(0_!?u$$TWG7l z->ZFCsXe}5sa1r0=^i>SniA%M@cp}fjcevgvYqT0zJ-P0dkgjt;*;H7uFyiBkvn)= zcpEPHl5BQX{`v8_Y+b2l7y5h6hq6eYwL5-{(w!HfwqTJ`JfJS=>D02jb_H}BT9lBy zTi{!57C;=lReAK=N56X*vQfe$HNSDug-ueHR{r_NPb(h7A^S}~2SgB?Ct4*BWKC!1 zfo#~2!A4z}SfnlT`dQVTZ53O1o$D2EjT^qzB~EboQ7-A&yTK!Dz@^AZqA;_sKZtF| zh$OK`jP{d6$AMSD8rhrcq$(E$-W_f@xSM#kG=2L7;^mR|V&5E=bJ zpgoRB>BW9ud%u{nwL8POXR_LMn5Oa(<<)qW-7JXj#lb%C_FkT@F>HeI>eId-KV*d1 zAIZRkHXOv>eV$}jp{|_NWT=|uU|gJ&pZJJ)x1rxH{LEj`kjh{FcS>#vMOn$+%W3)EXtRJ_bkTbG)eGieB65&nH(ZcOWrOT hiSX>(TEsnF7K*zXnS&m1Vv