This commit is contained in:
Ben Goldsworthy 2016-12-27 00:01:35 +00:00
parent 57800c7114
commit 8ddd6a2d49
4 changed files with 697 additions and 0 deletions

View file

@ -0,0 +1,81 @@
# Scraping Alpha
### Author
Ben Goldsworthy
<[email](mailto:b.goldsworthy@lancaster.ac.uk)>
<[website](http://www.bengoldsworthy.uk/)>
### Version
1.0
### Abstract
Scraping Alpha is a series of Python scripts used to scrape
[Seeking Alpha](http://seekingalpha.com/) earnings call transcripts and produce
SQL from them.
It was created for Dr Lars Hass of the Lancaster University Management School.
### Usage
The instructions for each step of the process can be found at the beginning of
each of the files involved: `transcript_spider.py`, `JSONtoSQL.py` and
`execsAndAnalysts.py`. They are repeated here for convenience.
#### `transcript_spider.py`
This file is the webspider that Scrapy uses to retrieve the information from the
website. Left unattended, it will scrape all 4,000+ pages of results.
To interrupt this behaviour and still be able to proceed with the other steps,
cancel the script with `CTRL+Z`. This will likely leave an unfinished JSON item
at the end of the output file. To clear this up, open the file in `vim` and type
the following keys:
```vim
G
V
d
$
a
BACKSPACE
ENTER
]
ESC
:wq
ENTER
```
This will truncate the file at the last complete record and seal it off.
For installation instructions for Scrapy, see
[here](https://doc.scrapy.org/en/latest/intro/install.html). This file should be
in the `spiders` directory of the project, and is run via `scrapy crawl
transcripts -o transcripts.json` at the command line (the output file will be
placed in the directory the Terminal is currently pointing to).
#### `JSONtoSQL.py`
This file takes the `transcripts.json` file output of `transcript_spider.py` and
converts it into SQL.
This file should be located in the same directory as `transcripts.json`, and is
run via `python JSONtoSQL.py > [FILE].sql`, where `[FILE]` is the desired name
of the output file.
#### `execsAndAnalysts.py`
First, import the output file of `JSONtoSQL.py` to your chosen DBMS (I've tested
it with phpMyAdmin). Then, run the following query:
```SQL
SELECT `id`, `execs`, `analysts` FROM `transcripts`
```
Export the resulting table ([instructions](http://serverfault.com/a/435443)) to
`transcripts.sql`, and place the file in the same directory as
`execsAndAnalysts.py`. Run it with `python execsAndAnalysts.py`.
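From this, it creates two files (`execs.sql` and `analysts.sql`). Import them
into your DBMS, `execs.sql` first and `analysts.sql` second, to create two
linking tables. The closing statements of `analysts.sql` then drop the
superfluous `execs` and `analysts` columns from the `transcripts` table and
delete a few rows from the tables involved (including `execs_to_transcripts`,
which is why `execs.sql` should be imported first).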

View file

@ -0,0 +1,401 @@
-- Bulk-loads the `analysts_to_transcripts` linking table, one row per pair.
-- NOTE(review): values in the first column are non-decreasing (1-62) while the
-- second column ranges over 0-365 and repeats (e.g. 81, 141); confirm that the
-- value order really matches the declared (`analyst_id`, `transcript_id`).
INSERT INTO `analysts_to_transcripts` (`analyst_id`, `transcript_id`) VALUES
(1, 0),
(2, 1),
(3, 2),
(3, 3),
(3, 4),
(3, 5),
(3, 6),
(3, 7),
(3, 8),
(3, 9),
(3, 10),
(4, 11),
(4, 12),
(4, 13),
(4, 14),
(4, 15),
(4, 16),
(4, 17),
(4, 18),
(4, 19),
(4, 20),
(4, 21),
(4, 22),
(4, 23),
(4, 24),
(4, 25),
(5, 26),
(5, 27),
(5, 28),
(6, 29),
(6, 30),
(6, 31),
(6, 32),
(6, 33),
(6, 34),
(6, 35),
(6, 36),
(6, 37),
(6, 38),
(6, 39),
(7, 40),
(7, 41),
(7, 42),
(7, 43),
(7, 44),
(7, 45),
(7, 46),
(7, 47),
(7, 48),
(8, 49),
(8, 50),
(8, 51),
(8, 52),
(8, 53),
(8, 54),
(8, 55),
(8, 56),
(8, 57),
(8, 58),
(8, 59),
(8, 60),
(8, 61),
(8, 62),
(8, 63),
(8, 64),
(8, 65),
(8, 66),
(8, 67),
(8, 68),
(9, 69),
(9, 70),
(9, 71),
(10, 72),
(10, 73),
(10, 74),
(10, 75),
(10, 76),
(10, 77),
(10, 78),
(10, 79),
(10, 80),
(10, 81),
(11, 82),
(11, 83),
(11, 84),
(11, 85),
(11, 86),
(11, 87),
(11, 88),
(11, 89),
(11, 90),
(11, 91),
(11, 92),
(11, 93),
(11, 94),
(11, 95),
(12, 96),
(12, 97),
(12, 98),
(12, 81),
(12, 99),
(13, 100),
(13, 101),
(13, 102),
(13, 103),
(13, 104),
(13, 105),
(13, 106),
(14, 107),
(14, 108),
(14, 109),
(14, 110),
(14, 111),
(14, 112),
(14, 113),
(14, 114),
(14, 115),
(14, 116),
(14, 117),
(15, 13),
(15, 118),
(15, 119),
(15, 120),
(15, 23),
(15, 121),
(16, 122),
(16, 39),
(16, 123),
(16, 124),
(16, 125),
(16, 126),
(17, 127),
(17, 128),
(17, 129),
(17, 130),
(17, 131),
(18, 132),
(18, 133),
(19, 134),
(19, 135),
(19, 136),
(19, 137),
(19, 138),
(19, 139),
(19, 140),
(19, 141),
(19, 142),
(20, 141),
(20, 143),
(20, 144),
(20, 145),
(20, 146),
(20, 142),
(20, 147),
(20, 148),
(20, 149),
(20, 95),
(20, 150),
(20, 151),
(20, 152),
(20, 153),
(21, 154),
(21, 155),
(21, 156),
(21, 157),
(21, 158),
(21, 159),
(21, 160),
(21, 161),
(21, 162),
(21, 163),
(21, 164),
(21, 165),
(21, 166),
(21, 167),
(21, 168),
(21, 169),
(21, 170),
(22, 171),
(22, 172),
(22, 173),
(22, 174),
(23, 175),
(23, 176),
(23, 177),
(23, 178),
(23, 179),
(23, 180),
(23, 181),
(24, 182),
(24, 183),
(25, 184),
(25, 185),
(25, 186),
(25, 187),
(25, 188),
(25, 189),
(25, 190),
(25, 191),
(25, 192),
(25, 193),
(25, 194),
(25, 195),
(25, 196),
(25, 197),
(25, 198),
(26, 199),
(26, 200),
(26, 201),
(26, 202),
(26, 203),
(26, 204),
(26, 205),
(26, 206),
(26, 207),
(26, 208),
(26, 209),
(26, 210),
(26, 211),
(26, 212),
(26, 213),
(27, 214),
(27, 215),
(27, 216),
(27, 44),
(27, 217),
(27, 218),
(27, 41),
(27, 219),
(27, 45),
(27, 220),
(28, 71),
(28, 70),
(30, 221),
(30, 222),
(30, 131),
(31, 223),
(31, 224),
(31, 225),
(32, 226),
(32, 85),
(32, 227),
(32, 228),
(32, 229),
(32, 230),
(32, 231),
(32, 232),
(32, 112),
(32, 233),
(32, 234),
(32, 235),
(33, 236),
(33, 237),
(33, 238),
(34, 239),
(34, 240),
(34, 241),
(34, 242),
(34, 243),
(34, 244),
(35, 245),
(35, 246),
(36, 247),
(36, 248),
(37, 249),
(37, 250),
(38, 251),
(38, 252),
(38, 253),
(39, 254),
(39, 255),
(39, 256),
(39, 257),
(39, 258),
(39, 259),
(39, 260),
(39, 261),
(39, 262),
(40, 263),
(40, 264),
(40, 265),
(41, 266),
(41, 267),
(42, 268),
(42, 269),
(42, 270),
(42, 271),
(42, 272),
(43, 190),
(43, 273),
(43, 274),
(43, 275),
(43, 276),
(43, 277),
(43, 278),
(43, 279),
(43, 280),
(43, 186),
(44, 281),
(44, 164),
(44, 282),
(44, 154),
(44, 283),
(44, 284),
(44, 285),
(44, 286),
(44, 287),
(44, 288),
(44, 289),
(44, 290),
(44, 291),
(45, 292),
(45, 293),
(45, 294),
(46, 295),
(47, 165),
(47, 296),
(47, 166),
(47, 297),
(47, 298),
(47, 299),
(47, 160),
(47, 300),
(48, 301),
(48, 302),
(48, 303),
(48, 304),
(48, 305),
(48, 306),
(48, 307),
(48, 308),
(49, 309),
(49, 310),
(49, 311),
(50, 312),
(50, 313),
(51, 314),
(52, 315),
(52, 316),
(52, 317),
(52, 318),
(53, 319),
(53, 320),
(54, 321),
(54, 322),
(54, 323),
(54, 324),
(54, 325),
(54, 326),
(54, 327),
(54, 328),
(55, 329),
(55, 330),
(55, 331),
(55, 332),
(55, 333),
(56, 334),
(56, 335),
(57, 336),
(57, 337),
(57, 338),
(57, 339),
(57, 340),
(57, 341),
(57, 342),
(57, 343),
(58, 344),
(58, 345),
(58, 238),
(58, 346),
(58, 347),
(58, 348),
(58, 349),
(58, 350),
(58, 351),
(58, 352),
(58, 353),
(58, 237),
(58, 354),
(59, 355),
(59, 356),
(59, 357),
(59, 358),
(59, 359),
(60, 360),
(60, 361),
(61, 362),
(61, 363),
(62, 364),
(62, 365);
-- Cleanup: remove the `execs` and `analysts` columns from `transcripts`
-- (the README describes them as superfluous once the linking tables exist).
ALTER TABLE `transcripts`
DROP COLUMN `execs`,
DROP COLUMN `analysts`;
-- Delete the row with id 1 from each of the three entity tables, and every
-- linking row that points at transcript 0.
-- NOTE(review): the generator (execsAndAnalysts.py) writes `id` = 0 for the
-- first three DELETEs, whereas this file uses `id` = 1 -- confirm which id is
-- intended and regenerate or correct accordingly.
DELETE FROM `transcripts` WHERE `id` = 1;
DELETE FROM `execs` WHERE `id` = 1;
DELETE FROM `analysts` WHERE `id` = 1;
-- These two statements touch both linking tables, so `execs_to_transcripts`
-- needs to be populated before this script runs for the delete to have effect.
DELETE FROM `execs_to_transcripts` WHERE `transcript_id` = 0;
DELETE FROM `analysts_to_transcripts` WHERE `transcript_id` = 0;

View file

@ -0,0 +1,208 @@
-- Bulk-loads the `execs_to_transcripts` linking table, one row per pair.
-- NOTE(review): values in the first column are non-decreasing (1-62) while the
-- second column counts up from 0 to 206 without repeats; confirm that the
-- value order really matches the declared (`exec_id`, `transcript_id`).
INSERT INTO `execs_to_transcripts` (`exec_id`, `transcript_id`) VALUES
(1, 0),
(2, 1),
(2, 2),
(2, 3),
(2, 4),
(3, 5),
(3, 6),
(3, 7),
(4, 8),
(4, 9),
(4, 10),
(5, 11),
(5, 12),
(5, 13),
(5, 14),
(6, 15),
(6, 16),
(6, 17),
(6, 18),
(6, 19),
(7, 20),
(7, 21),
(7, 22),
(7, 23),
(8, 24),
(8, 25),
(8, 26),
(9, 27),
(9, 28),
(9, 29),
(9, 30),
(10, 31),
(10, 32),
(11, 33),
(11, 34),
(11, 35),
(12, 36),
(12, 37),
(12, 38),
(12, 39),
(13, 40),
(13, 41),
(13, 42),
(14, 43),
(14, 44),
(14, 45),
(14, 46),
(14, 47),
(14, 48),
(14, 49),
(14, 50),
(14, 51),
(15, 52),
(15, 53),
(15, 54),
(16, 55),
(16, 56),
(16, 57),
(17, 58),
(17, 59),
(17, 60),
(17, 61),
(18, 62),
(18, 63),
(19, 64),
(19, 65),
(19, 66),
(20, 67),
(20, 68),
(21, 69),
(21, 70),
(21, 71),
(22, 72),
(22, 73),
(22, 74),
(23, 75),
(23, 76),
(23, 77),
(23, 78),
(24, 79),
(24, 80),
(24, 81),
(25, 82),
(25, 83),
(25, 84),
(26, 85),
(26, 86),
(26, 87),
(26, 88),
(27, 89),
(27, 90),
(27, 91),
(27, 92),
(27, 93),
(28, 94),
(28, 95),
(28, 96),
(29, 97),
(29, 98),
(29, 99),
(30, 100),
(30, 101),
(31, 102),
(31, 103),
(31, 104),
(31, 105),
(32, 106),
(32, 107),
(33, 108),
(33, 109),
(34, 110),
(34, 111),
(35, 112),
(35, 113),
(35, 114),
(36, 115),
(36, 116),
(36, 117),
(36, 118),
(37, 119),
(37, 120),
(37, 121),
(38, 122),
(38, 123),
(39, 124),
(39, 125),
(39, 126),
(40, 127),
(40, 128),
(40, 129),
(41, 130),
(41, 131),
(41, 132),
(41, 133),
(42, 134),
(42, 135),
(42, 136),
(42, 137),
(42, 138),
(42, 139),
(43, 140),
(43, 141),
(43, 142),
(44, 143),
(44, 144),
(44, 145),
(45, 146),
(45, 147),
(45, 148),
(46, 149),
(46, 150),
(46, 151),
(47, 152),
(47, 153),
(47, 154),
(47, 155),
(48, 156),
(48, 157),
(48, 158),
(49, 159),
(49, 160),
(49, 161),
(50, 162),
(50, 163),
(51, 164),
(51, 165),
(52, 166),
(52, 167),
(52, 168),
(53, 169),
(53, 170),
(53, 171),
(54, 172),
(54, 173),
(54, 174),
(54, 175),
(54, 176),
(54, 177),
(55, 178),
(55, 179),
(55, 180),
(56, 181),
(56, 182),
(57, 183),
(57, 184),
(57, 185),
(58, 186),
(58, 187),
(58, 188),
(58, 189),
(58, 190),
(58, 191),
(58, 192),
(58, 193),
(58, 194),
(58, 195),
(59, 196),
(59, 197),
(59, 198),
(60, 199),
(60, 200),
(60, 201),
(61, 202),
(61, 203),
(62, 204),
(62, 205),
(62, 206);

View file

@ -102,3 +102,10 @@ with open("analysts.sql", 'rb+') as filehandle:
# Swap the last byte of `analysts.sql` for the statement terminator: position
# one byte before EOF, cut the file there, then append ";".
# Presumably the removed byte is the "," left after the final VALUES row (the
# row-writing code is outside this view) -- TODO confirm.
# NOTE(review): the enclosing `with` opens the file in 'rb+' (binary) mode;
# writing str literals to it works on Python 2 but raises TypeError on
# Python 3 -- confirm the target interpreter.
filehandle.seek(-1, os.SEEK_END)
filehandle.truncate()
filehandle.write(";")
# `analysts.sql` then performs some cleanup on the database.
# First: drop the `execs` and `analysts` columns from `transcripts`.
filehandle.write("\n\nALTER TABLE `transcripts`\n\tDROP COLUMN `execs`,\n\tDROP COLUMN `analysts`;\n\n")
# Then: delete the id-0 row from each entity table and the linking rows that
# reference transcript 0.
# NOTE(review): the `analysts.sql` committed alongside this script uses
# `id` = 1 in the first three DELETEs instead of 0 -- confirm which is intended.
filehandle.write("DELETE FROM `transcripts` WHERE `id` = 0;\n")
filehandle.write("DELETE FROM `execs` WHERE `id` = 0;\n")
filehandle.write("DELETE FROM `analysts` WHERE `id` = 0;\n")
filehandle.write("DELETE FROM `execs_to_transcripts` WHERE `transcript_id` = 0;\n")
filehandle.write("DELETE FROM `analysts_to_transcripts` WHERE `transcript_id` = 0;\n")