Bit more
This commit is contained in:
parent
57800c7114
commit
8ddd6a2d49
4 changed files with 697 additions and 0 deletions
81
scraping-alpha/Scraping_Alpha/README
Normal file
81
scraping-alpha/Scraping_Alpha/README
Normal file
|
@ -0,0 +1,81 @@
|
|||
# Scraping Alpha
|
||||
|
||||
### Author
|
||||
|
||||
Ben Goldsworthy
|
||||
<[email](mailto:b.goldsworthy@lancaster.ac.uk)>
|
||||
<[website](http://www.bengoldsworthy.uk/)>
|
||||
|
||||
### Version
|
||||
|
||||
1.0
|
||||
|
||||
### Abstract
|
||||
|
||||
Scraping Alpha is a series of Python scripts used to scrape
|
||||
[Seeking Alpha](http://seekingalpha.com/) earnings call transcripts and produce
|
||||
SQL from them.
|
||||
|
||||
It was created for Dr Lars Hass of the Lancaster University Management School.
|
||||
|
||||
### Usage
|
||||
|
||||
The instructions for each step of the process can be found at the beginning of
|
||||
each of the files involved: `transcript_spider.py`, `JSONtoSQL.py` and
|
||||
`execsAndAnalysts.py`. They are repeated here for convenience.
|
||||
|
||||
#### `transcript_spider.py`
|
||||
|
||||
This file is the webspider that Scrapy uses to retrieve the information from the
|
||||
website. Left unattended, it will scrape all 4,000+ pages of results.
|
||||
To interrupt this behaviour and still be able to proceed with the other steps,
|
||||
cancel the script with `CTRL+Z`. This will likely leave an unfinished JSON item
|
||||
at the end of the output file. To clear this up, open the file in `vim` and type
|
||||
the following keys:
|
||||
```vim
|
||||
G
|
||||
V
|
||||
d
|
||||
$
|
||||
i
|
||||
BACKSPACE
|
||||
ENTER
|
||||
]
|
||||
ESC
|
||||
:wq
|
||||
ENTER
|
||||
```
|
||||
|
||||
This will truncate the file at the last complete record and seal it off.
|
||||
|
||||
For installation instructions for Scrapy, see
|
||||
[here](https://doc.scrapy.org/en/latest/intro/install.html). This file should be
|
||||
in the `spiders` directory of the project, and is run via `scrapy crawl
|
||||
transcripts -o transcripts.json` at the command line (the output file will be
|
||||
placed in the directory the Terminal is currently pointing to).
|
||||
|
||||
#### `JSONtoSQL.py`
|
||||
|
||||
This file takes the `transcripts.json` file output of `transcript_spider.py` and
|
||||
converts it into SQL.
|
||||
|
||||
This file should be located in the same directory as `transcripts.json`, and is
|
||||
run via `python JSONtoSQL.py > [FILE].sql`, where `[FILE]` is the desired name
|
||||
of the output file.
|
||||
|
||||
#### `execsAndAnalysts.py`
|
||||
|
||||
First, import the output file of `JSONtoSQL.py` into your chosen DBMS (I've tested
|
||||
it with phpMyAdmin). Then, run the following query:
|
||||
```SQL
|
||||
SELECT `id`, `execs`, `analysts` FROM `transcripts`
|
||||
```
|
||||
|
||||
Export the resulting table ([instructions](http://serverfault.com/a/435443)) to
|
||||
`transcripts.sql`, and place the file in the same directory as
|
||||
`execsAndAnalysts.py`. Run it with `python execsAndAnalysts.py`.
|
||||
|
||||
From this, it creates two files (`execs.sql` and `analysts.sql`). Import them
|
||||
into your DBMS to create two linking tables. The final instruction of
|
||||
`analysts.sql` then deletes the superfluous `execs` and `analysts` columns from
|
||||
the `transcripts` table.
|
401
scraping-alpha/Scraping_Alpha/analysts.sql
Normal file
401
scraping-alpha/Scraping_Alpha/analysts.sql
Normal file
|
@ -0,0 +1,401 @@
|
|||
INSERT INTO `analysts_to_transcripts` (`analyst_id`, `transcript_id`) VALUES
|
||||
(1, 0),
|
||||
(2, 1),
|
||||
(3, 2),
|
||||
(3, 3),
|
||||
(3, 4),
|
||||
(3, 5),
|
||||
(3, 6),
|
||||
(3, 7),
|
||||
(3, 8),
|
||||
(3, 9),
|
||||
(3, 10),
|
||||
(4, 11),
|
||||
(4, 12),
|
||||
(4, 13),
|
||||
(4, 14),
|
||||
(4, 15),
|
||||
(4, 16),
|
||||
(4, 17),
|
||||
(4, 18),
|
||||
(4, 19),
|
||||
(4, 20),
|
||||
(4, 21),
|
||||
(4, 22),
|
||||
(4, 23),
|
||||
(4, 24),
|
||||
(4, 25),
|
||||
(5, 26),
|
||||
(5, 27),
|
||||
(5, 28),
|
||||
(6, 29),
|
||||
(6, 30),
|
||||
(6, 31),
|
||||
(6, 32),
|
||||
(6, 33),
|
||||
(6, 34),
|
||||
(6, 35),
|
||||
(6, 36),
|
||||
(6, 37),
|
||||
(6, 38),
|
||||
(6, 39),
|
||||
(7, 40),
|
||||
(7, 41),
|
||||
(7, 42),
|
||||
(7, 43),
|
||||
(7, 44),
|
||||
(7, 45),
|
||||
(7, 46),
|
||||
(7, 47),
|
||||
(7, 48),
|
||||
(8, 49),
|
||||
(8, 50),
|
||||
(8, 51),
|
||||
(8, 52),
|
||||
(8, 53),
|
||||
(8, 54),
|
||||
(8, 55),
|
||||
(8, 56),
|
||||
(8, 57),
|
||||
(8, 58),
|
||||
(8, 59),
|
||||
(8, 60),
|
||||
(8, 61),
|
||||
(8, 62),
|
||||
(8, 63),
|
||||
(8, 64),
|
||||
(8, 65),
|
||||
(8, 66),
|
||||
(8, 67),
|
||||
(8, 68),
|
||||
(9, 69),
|
||||
(9, 70),
|
||||
(9, 71),
|
||||
(10, 72),
|
||||
(10, 73),
|
||||
(10, 74),
|
||||
(10, 75),
|
||||
(10, 76),
|
||||
(10, 77),
|
||||
(10, 78),
|
||||
(10, 79),
|
||||
(10, 80),
|
||||
(10, 81),
|
||||
(11, 82),
|
||||
(11, 83),
|
||||
(11, 84),
|
||||
(11, 85),
|
||||
(11, 86),
|
||||
(11, 87),
|
||||
(11, 88),
|
||||
(11, 89),
|
||||
(11, 90),
|
||||
(11, 91),
|
||||
(11, 92),
|
||||
(11, 93),
|
||||
(11, 94),
|
||||
(11, 95),
|
||||
(12, 96),
|
||||
(12, 97),
|
||||
(12, 98),
|
||||
(12, 81),
|
||||
(12, 99),
|
||||
(13, 100),
|
||||
(13, 101),
|
||||
(13, 102),
|
||||
(13, 103),
|
||||
(13, 104),
|
||||
(13, 105),
|
||||
(13, 106),
|
||||
(14, 107),
|
||||
(14, 108),
|
||||
(14, 109),
|
||||
(14, 110),
|
||||
(14, 111),
|
||||
(14, 112),
|
||||
(14, 113),
|
||||
(14, 114),
|
||||
(14, 115),
|
||||
(14, 116),
|
||||
(14, 117),
|
||||
(15, 13),
|
||||
(15, 118),
|
||||
(15, 119),
|
||||
(15, 120),
|
||||
(15, 23),
|
||||
(15, 121),
|
||||
(16, 122),
|
||||
(16, 39),
|
||||
(16, 123),
|
||||
(16, 124),
|
||||
(16, 125),
|
||||
(16, 126),
|
||||
(17, 127),
|
||||
(17, 128),
|
||||
(17, 129),
|
||||
(17, 130),
|
||||
(17, 131),
|
||||
(18, 132),
|
||||
(18, 133),
|
||||
(19, 134),
|
||||
(19, 135),
|
||||
(19, 136),
|
||||
(19, 137),
|
||||
(19, 138),
|
||||
(19, 139),
|
||||
(19, 140),
|
||||
(19, 141),
|
||||
(19, 142),
|
||||
(20, 141),
|
||||
(20, 143),
|
||||
(20, 144),
|
||||
(20, 145),
|
||||
(20, 146),
|
||||
(20, 142),
|
||||
(20, 147),
|
||||
(20, 148),
|
||||
(20, 149),
|
||||
(20, 95),
|
||||
(20, 150),
|
||||
(20, 151),
|
||||
(20, 152),
|
||||
(20, 153),
|
||||
(21, 154),
|
||||
(21, 155),
|
||||
(21, 156),
|
||||
(21, 157),
|
||||
(21, 158),
|
||||
(21, 159),
|
||||
(21, 160),
|
||||
(21, 161),
|
||||
(21, 162),
|
||||
(21, 163),
|
||||
(21, 164),
|
||||
(21, 165),
|
||||
(21, 166),
|
||||
(21, 167),
|
||||
(21, 168),
|
||||
(21, 169),
|
||||
(21, 170),
|
||||
(22, 171),
|
||||
(22, 172),
|
||||
(22, 173),
|
||||
(22, 174),
|
||||
(23, 175),
|
||||
(23, 176),
|
||||
(23, 177),
|
||||
(23, 178),
|
||||
(23, 179),
|
||||
(23, 180),
|
||||
(23, 181),
|
||||
(24, 182),
|
||||
(24, 183),
|
||||
(25, 184),
|
||||
(25, 185),
|
||||
(25, 186),
|
||||
(25, 187),
|
||||
(25, 188),
|
||||
(25, 189),
|
||||
(25, 190),
|
||||
(25, 191),
|
||||
(25, 192),
|
||||
(25, 193),
|
||||
(25, 194),
|
||||
(25, 195),
|
||||
(25, 196),
|
||||
(25, 197),
|
||||
(25, 198),
|
||||
(26, 199),
|
||||
(26, 200),
|
||||
(26, 201),
|
||||
(26, 202),
|
||||
(26, 203),
|
||||
(26, 204),
|
||||
(26, 205),
|
||||
(26, 206),
|
||||
(26, 207),
|
||||
(26, 208),
|
||||
(26, 209),
|
||||
(26, 210),
|
||||
(26, 211),
|
||||
(26, 212),
|
||||
(26, 213),
|
||||
(27, 214),
|
||||
(27, 215),
|
||||
(27, 216),
|
||||
(27, 44),
|
||||
(27, 217),
|
||||
(27, 218),
|
||||
(27, 41),
|
||||
(27, 219),
|
||||
(27, 45),
|
||||
(27, 220),
|
||||
(28, 71),
|
||||
(28, 70),
|
||||
(30, 221),
|
||||
(30, 222),
|
||||
(30, 131),
|
||||
(31, 223),
|
||||
(31, 224),
|
||||
(31, 225),
|
||||
(32, 226),
|
||||
(32, 85),
|
||||
(32, 227),
|
||||
(32, 228),
|
||||
(32, 229),
|
||||
(32, 230),
|
||||
(32, 231),
|
||||
(32, 232),
|
||||
(32, 112),
|
||||
(32, 233),
|
||||
(32, 234),
|
||||
(32, 235),
|
||||
(33, 236),
|
||||
(33, 237),
|
||||
(33, 238),
|
||||
(34, 239),
|
||||
(34, 240),
|
||||
(34, 241),
|
||||
(34, 242),
|
||||
(34, 243),
|
||||
(34, 244),
|
||||
(35, 245),
|
||||
(35, 246),
|
||||
(36, 247),
|
||||
(36, 248),
|
||||
(37, 249),
|
||||
(37, 250),
|
||||
(38, 251),
|
||||
(38, 252),
|
||||
(38, 253),
|
||||
(39, 254),
|
||||
(39, 255),
|
||||
(39, 256),
|
||||
(39, 257),
|
||||
(39, 258),
|
||||
(39, 259),
|
||||
(39, 260),
|
||||
(39, 261),
|
||||
(39, 262),
|
||||
(40, 263),
|
||||
(40, 264),
|
||||
(40, 265),
|
||||
(41, 266),
|
||||
(41, 267),
|
||||
(42, 268),
|
||||
(42, 269),
|
||||
(42, 270),
|
||||
(42, 271),
|
||||
(42, 272),
|
||||
(43, 190),
|
||||
(43, 273),
|
||||
(43, 274),
|
||||
(43, 275),
|
||||
(43, 276),
|
||||
(43, 277),
|
||||
(43, 278),
|
||||
(43, 279),
|
||||
(43, 280),
|
||||
(43, 186),
|
||||
(44, 281),
|
||||
(44, 164),
|
||||
(44, 282),
|
||||
(44, 154),
|
||||
(44, 283),
|
||||
(44, 284),
|
||||
(44, 285),
|
||||
(44, 286),
|
||||
(44, 287),
|
||||
(44, 288),
|
||||
(44, 289),
|
||||
(44, 290),
|
||||
(44, 291),
|
||||
(45, 292),
|
||||
(45, 293),
|
||||
(45, 294),
|
||||
(46, 295),
|
||||
(47, 165),
|
||||
(47, 296),
|
||||
(47, 166),
|
||||
(47, 297),
|
||||
(47, 298),
|
||||
(47, 299),
|
||||
(47, 160),
|
||||
(47, 300),
|
||||
(48, 301),
|
||||
(48, 302),
|
||||
(48, 303),
|
||||
(48, 304),
|
||||
(48, 305),
|
||||
(48, 306),
|
||||
(48, 307),
|
||||
(48, 308),
|
||||
(49, 309),
|
||||
(49, 310),
|
||||
(49, 311),
|
||||
(50, 312),
|
||||
(50, 313),
|
||||
(51, 314),
|
||||
(52, 315),
|
||||
(52, 316),
|
||||
(52, 317),
|
||||
(52, 318),
|
||||
(53, 319),
|
||||
(53, 320),
|
||||
(54, 321),
|
||||
(54, 322),
|
||||
(54, 323),
|
||||
(54, 324),
|
||||
(54, 325),
|
||||
(54, 326),
|
||||
(54, 327),
|
||||
(54, 328),
|
||||
(55, 329),
|
||||
(55, 330),
|
||||
(55, 331),
|
||||
(55, 332),
|
||||
(55, 333),
|
||||
(56, 334),
|
||||
(56, 335),
|
||||
(57, 336),
|
||||
(57, 337),
|
||||
(57, 338),
|
||||
(57, 339),
|
||||
(57, 340),
|
||||
(57, 341),
|
||||
(57, 342),
|
||||
(57, 343),
|
||||
(58, 344),
|
||||
(58, 345),
|
||||
(58, 238),
|
||||
(58, 346),
|
||||
(58, 347),
|
||||
(58, 348),
|
||||
(58, 349),
|
||||
(58, 350),
|
||||
(58, 351),
|
||||
(58, 352),
|
||||
(58, 353),
|
||||
(58, 237),
|
||||
(58, 354),
|
||||
(59, 355),
|
||||
(59, 356),
|
||||
(59, 357),
|
||||
(59, 358),
|
||||
(59, 359),
|
||||
(60, 360),
|
||||
(60, 361),
|
||||
(61, 362),
|
||||
(61, 363),
|
||||
(62, 364),
|
||||
(62, 365);
|
||||
|
||||
ALTER TABLE `transcripts`
|
||||
DROP COLUMN `execs`,
|
||||
DROP COLUMN `analysts`;
|
||||
|
||||
DELETE FROM `transcripts` WHERE `id` = 1;
|
||||
DELETE FROM `execs` WHERE `id` = 1;
|
||||
DELETE FROM `analysts` WHERE `id` = 1;
|
||||
DELETE FROM `execs_to_transcripts` WHERE `transcript_id` = 0;
|
||||
DELETE FROM `analysts_to_transcripts` WHERE `transcript_id` = 0;
|
208
scraping-alpha/Scraping_Alpha/execs.sql
Normal file
208
scraping-alpha/Scraping_Alpha/execs.sql
Normal file
|
@ -0,0 +1,208 @@
|
|||
INSERT INTO `execs_to_transcripts` (`exec_id`, `transcript_id`) VALUES
|
||||
(1, 0),
|
||||
(2, 1),
|
||||
(2, 2),
|
||||
(2, 3),
|
||||
(2, 4),
|
||||
(3, 5),
|
||||
(3, 6),
|
||||
(3, 7),
|
||||
(4, 8),
|
||||
(4, 9),
|
||||
(4, 10),
|
||||
(5, 11),
|
||||
(5, 12),
|
||||
(5, 13),
|
||||
(5, 14),
|
||||
(6, 15),
|
||||
(6, 16),
|
||||
(6, 17),
|
||||
(6, 18),
|
||||
(6, 19),
|
||||
(7, 20),
|
||||
(7, 21),
|
||||
(7, 22),
|
||||
(7, 23),
|
||||
(8, 24),
|
||||
(8, 25),
|
||||
(8, 26),
|
||||
(9, 27),
|
||||
(9, 28),
|
||||
(9, 29),
|
||||
(9, 30),
|
||||
(10, 31),
|
||||
(10, 32),
|
||||
(11, 33),
|
||||
(11, 34),
|
||||
(11, 35),
|
||||
(12, 36),
|
||||
(12, 37),
|
||||
(12, 38),
|
||||
(12, 39),
|
||||
(13, 40),
|
||||
(13, 41),
|
||||
(13, 42),
|
||||
(14, 43),
|
||||
(14, 44),
|
||||
(14, 45),
|
||||
(14, 46),
|
||||
(14, 47),
|
||||
(14, 48),
|
||||
(14, 49),
|
||||
(14, 50),
|
||||
(14, 51),
|
||||
(15, 52),
|
||||
(15, 53),
|
||||
(15, 54),
|
||||
(16, 55),
|
||||
(16, 56),
|
||||
(16, 57),
|
||||
(17, 58),
|
||||
(17, 59),
|
||||
(17, 60),
|
||||
(17, 61),
|
||||
(18, 62),
|
||||
(18, 63),
|
||||
(19, 64),
|
||||
(19, 65),
|
||||
(19, 66),
|
||||
(20, 67),
|
||||
(20, 68),
|
||||
(21, 69),
|
||||
(21, 70),
|
||||
(21, 71),
|
||||
(22, 72),
|
||||
(22, 73),
|
||||
(22, 74),
|
||||
(23, 75),
|
||||
(23, 76),
|
||||
(23, 77),
|
||||
(23, 78),
|
||||
(24, 79),
|
||||
(24, 80),
|
||||
(24, 81),
|
||||
(25, 82),
|
||||
(25, 83),
|
||||
(25, 84),
|
||||
(26, 85),
|
||||
(26, 86),
|
||||
(26, 87),
|
||||
(26, 88),
|
||||
(27, 89),
|
||||
(27, 90),
|
||||
(27, 91),
|
||||
(27, 92),
|
||||
(27, 93),
|
||||
(28, 94),
|
||||
(28, 95),
|
||||
(28, 96),
|
||||
(29, 97),
|
||||
(29, 98),
|
||||
(29, 99),
|
||||
(30, 100),
|
||||
(30, 101),
|
||||
(31, 102),
|
||||
(31, 103),
|
||||
(31, 104),
|
||||
(31, 105),
|
||||
(32, 106),
|
||||
(32, 107),
|
||||
(33, 108),
|
||||
(33, 109),
|
||||
(34, 110),
|
||||
(34, 111),
|
||||
(35, 112),
|
||||
(35, 113),
|
||||
(35, 114),
|
||||
(36, 115),
|
||||
(36, 116),
|
||||
(36, 117),
|
||||
(36, 118),
|
||||
(37, 119),
|
||||
(37, 120),
|
||||
(37, 121),
|
||||
(38, 122),
|
||||
(38, 123),
|
||||
(39, 124),
|
||||
(39, 125),
|
||||
(39, 126),
|
||||
(40, 127),
|
||||
(40, 128),
|
||||
(40, 129),
|
||||
(41, 130),
|
||||
(41, 131),
|
||||
(41, 132),
|
||||
(41, 133),
|
||||
(42, 134),
|
||||
(42, 135),
|
||||
(42, 136),
|
||||
(42, 137),
|
||||
(42, 138),
|
||||
(42, 139),
|
||||
(43, 140),
|
||||
(43, 141),
|
||||
(43, 142),
|
||||
(44, 143),
|
||||
(44, 144),
|
||||
(44, 145),
|
||||
(45, 146),
|
||||
(45, 147),
|
||||
(45, 148),
|
||||
(46, 149),
|
||||
(46, 150),
|
||||
(46, 151),
|
||||
(47, 152),
|
||||
(47, 153),
|
||||
(47, 154),
|
||||
(47, 155),
|
||||
(48, 156),
|
||||
(48, 157),
|
||||
(48, 158),
|
||||
(49, 159),
|
||||
(49, 160),
|
||||
(49, 161),
|
||||
(50, 162),
|
||||
(50, 163),
|
||||
(51, 164),
|
||||
(51, 165),
|
||||
(52, 166),
|
||||
(52, 167),
|
||||
(52, 168),
|
||||
(53, 169),
|
||||
(53, 170),
|
||||
(53, 171),
|
||||
(54, 172),
|
||||
(54, 173),
|
||||
(54, 174),
|
||||
(54, 175),
|
||||
(54, 176),
|
||||
(54, 177),
|
||||
(55, 178),
|
||||
(55, 179),
|
||||
(55, 180),
|
||||
(56, 181),
|
||||
(56, 182),
|
||||
(57, 183),
|
||||
(57, 184),
|
||||
(57, 185),
|
||||
(58, 186),
|
||||
(58, 187),
|
||||
(58, 188),
|
||||
(58, 189),
|
||||
(58, 190),
|
||||
(58, 191),
|
||||
(58, 192),
|
||||
(58, 193),
|
||||
(58, 194),
|
||||
(58, 195),
|
||||
(59, 196),
|
||||
(59, 197),
|
||||
(59, 198),
|
||||
(60, 199),
|
||||
(60, 200),
|
||||
(60, 201),
|
||||
(61, 202),
|
||||
(61, 203),
|
||||
(62, 204),
|
||||
(62, 205),
|
||||
(62, 206);
|
|
@ -102,3 +102,10 @@ with open("analysts.sql", 'rb+') as filehandle:
|
|||
filehandle.seek(-1, os.SEEK_END)
|
||||
filehandle.truncate()
|
||||
filehandle.write(";")
|
||||
# `analysts.sql` then performs some cleanup on the database.
|
||||
filehandle.write("\n\nALTER TABLE `transcripts`\n\tDROP COLUMN `execs`,\n\tDROP COLUMN `analysts`;\n\n")
|
||||
filehandle.write("DELETE FROM `transcripts` WHERE `id` = 0;\n")
|
||||
filehandle.write("DELETE FROM `execs` WHERE `id` = 0;\n")
|
||||
filehandle.write("DELETE FROM `analysts` WHERE `id` = 0;\n")
|
||||
filehandle.write("DELETE FROM `execs_to_transcripts` WHERE `transcript_id` = 0;\n")
|
||||
filehandle.write("DELETE FROM `analysts_to_transcripts` WHERE `transcript_id` = 0;\n")
|
||||
|
|
Reference in a new issue