diff --git a/scraping-alpha/Scraping_Alpha/README b/scraping-alpha/Scraping_Alpha/README new file mode 100644 index 0000000..effe841 --- /dev/null +++ b/scraping-alpha/Scraping_Alpha/README @@ -0,0 +1,81 @@ +# Scraping Alpha + +### Author + +Ben Goldsworthy +<[email](mailto:b.goldsworthy@lancaster.ac.uk)> +<[website](http://www.bengoldsworthy.uk/)> + +### Version + +1.0 + +### Abstract + +Scraping Alpha is a series of Python scripts used to scrape +[Seeking Alpha](http://seekingalpha.com/) earnings call transcripts and produce +SQL from them. + +It was created for Dr Lars Hass of the Lancaster University Management School. + +### Usage + +The instructions for each step of the process can be found at the beginning of +each of the files involved: `transcript_spider.py`, `JSONtoSQL.py` and +`execsAndAnalysts.py`. They are repeated here for convenience. + +#### `transcript_spider.py` + +This file is the webspider that Scrapy uses to retrieve the information from the + website. Left unattended, it will scrape all 4,000+ pages of results. +To interrupt this behaviour and still be able to proceed with the other steps, +cancel the script with `CTRL+Z`. This will likely leave an unfinished JSON item +at the end of the output file. To clear this up, open the file in `vim` and type + the following keys: +```vim +G +V +d +$ +i +BACKSPACE +ENTER +] +ESC +:wq +ENTER +``` + +This will truncate the file at the last complete record and seal it off. + +For installation instructions for Scrapy, see +[here](https://doc.scrapy.org/en/latest/intro/install.html). This file should be + in the `spiders` directory of the project, and is run via `scrapy crawl + transcripts -o transcripts.json` at the command line (the output file will be + placed in the directory the Terminal is currently pointing to). + +#### `JSONtoSQL.py` + +This file takes the `transcripts.json` file output of `transcript_spider.py` and + converts it into SQL. 
+ +This file should be located in the same directory as `transcripts.json`, and is +run via `python JSONtoSQL.py > [FILE].sql`, where `[FILE]` is the desired name +of the output file. + +#### `execsAndAnalysts.py` + +First, import the output file of `JSONtoSQL.py` to your chosen DBMS (I've tested + it with phpMyAdmin). Then, run the following query: +```SQL +SELECT `id`, `execs`, `analysts` FROM `transcripts` +``` + +Export the resulting table ([instructions](http://serverfault.com/a/435443)) to +`transcripts.sql`, and place the file in the same directory as +`execsAndAnalysts.py`. Run it with `python execsAndAnalysts.py`. + +From this it creates two files (`execs.sql` and `analysts.sql`). Import them +into your DBMS to create two linking tables. The final instruction of +`analysts.sql` then deletes the superfluous `execs` and `analysts` columns from +the `transcripts` table. diff --git a/scraping-alpha/Scraping_Alpha/analysts.sql b/scraping-alpha/Scraping_Alpha/analysts.sql new file mode 100644 index 0000000..d3e71b5 --- /dev/null +++ b/scraping-alpha/Scraping_Alpha/analysts.sql @@ -0,0 +1,401 @@ +INSERT INTO `analysts_to_transcripts` (`analyst_id`, `transcript_id`) VALUES + (1, 0), + (2, 1), + (3, 2), + (3, 3), + (3, 4), + (3, 5), + (3, 6), + (3, 7), + (3, 8), + (3, 9), + (3, 10), + (4, 11), + (4, 12), + (4, 13), + (4, 14), + (4, 15), + (4, 16), + (4, 17), + (4, 18), + (4, 19), + (4, 20), + (4, 21), + (4, 22), + (4, 23), + (4, 24), + (4, 25), + (5, 26), + (5, 27), + (5, 28), + (6, 29), + (6, 30), + (6, 31), + (6, 32), + (6, 33), + (6, 34), + (6, 35), + (6, 36), + (6, 37), + (6, 38), + (6, 39), + (7, 40), + (7, 41), + (7, 42), + (7, 43), + (7, 44), + (7, 45), + (7, 46), + (7, 47), + (7, 48), + (8, 49), + (8, 50), + (8, 51), + (8, 52), + (8, 53), + (8, 54), + (8, 55), + (8, 56), + (8, 57), + (8, 58), + (8, 59), + (8, 60), + (8, 61), + (8, 62), + (8, 63), + (8, 64), + (8, 65), + (8, 66), + (8, 67), + (8, 68), + (9, 69), + (9, 70), + (9, 71), + (10, 72), + (10, 73), + 
(10, 74), + (10, 75), + (10, 76), + (10, 77), + (10, 78), + (10, 79), + (10, 80), + (10, 81), + (11, 82), + (11, 83), + (11, 84), + (11, 85), + (11, 86), + (11, 87), + (11, 88), + (11, 89), + (11, 90), + (11, 91), + (11, 92), + (11, 93), + (11, 94), + (11, 95), + (12, 96), + (12, 97), + (12, 98), + (12, 81), + (12, 99), + (13, 100), + (13, 101), + (13, 102), + (13, 103), + (13, 104), + (13, 105), + (13, 106), + (14, 107), + (14, 108), + (14, 109), + (14, 110), + (14, 111), + (14, 112), + (14, 113), + (14, 114), + (14, 115), + (14, 116), + (14, 117), + (15, 13), + (15, 118), + (15, 119), + (15, 120), + (15, 23), + (15, 121), + (16, 122), + (16, 39), + (16, 123), + (16, 124), + (16, 125), + (16, 126), + (17, 127), + (17, 128), + (17, 129), + (17, 130), + (17, 131), + (18, 132), + (18, 133), + (19, 134), + (19, 135), + (19, 136), + (19, 137), + (19, 138), + (19, 139), + (19, 140), + (19, 141), + (19, 142), + (20, 141), + (20, 143), + (20, 144), + (20, 145), + (20, 146), + (20, 142), + (20, 147), + (20, 148), + (20, 149), + (20, 95), + (20, 150), + (20, 151), + (20, 152), + (20, 153), + (21, 154), + (21, 155), + (21, 156), + (21, 157), + (21, 158), + (21, 159), + (21, 160), + (21, 161), + (21, 162), + (21, 163), + (21, 164), + (21, 165), + (21, 166), + (21, 167), + (21, 168), + (21, 169), + (21, 170), + (22, 171), + (22, 172), + (22, 173), + (22, 174), + (23, 175), + (23, 176), + (23, 177), + (23, 178), + (23, 179), + (23, 180), + (23, 181), + (24, 182), + (24, 183), + (25, 184), + (25, 185), + (25, 186), + (25, 187), + (25, 188), + (25, 189), + (25, 190), + (25, 191), + (25, 192), + (25, 193), + (25, 194), + (25, 195), + (25, 196), + (25, 197), + (25, 198), + (26, 199), + (26, 200), + (26, 201), + (26, 202), + (26, 203), + (26, 204), + (26, 205), + (26, 206), + (26, 207), + (26, 208), + (26, 209), + (26, 210), + (26, 211), + (26, 212), + (26, 213), + (27, 214), + (27, 215), + (27, 216), + (27, 44), + (27, 217), + (27, 218), + (27, 41), + (27, 219), + (27, 45), + (27, 
220), + (28, 71), + (28, 70), + (30, 221), + (30, 222), + (30, 131), + (31, 223), + (31, 224), + (31, 225), + (32, 226), + (32, 85), + (32, 227), + (32, 228), + (32, 229), + (32, 230), + (32, 231), + (32, 232), + (32, 112), + (32, 233), + (32, 234), + (32, 235), + (33, 236), + (33, 237), + (33, 238), + (34, 239), + (34, 240), + (34, 241), + (34, 242), + (34, 243), + (34, 244), + (35, 245), + (35, 246), + (36, 247), + (36, 248), + (37, 249), + (37, 250), + (38, 251), + (38, 252), + (38, 253), + (39, 254), + (39, 255), + (39, 256), + (39, 257), + (39, 258), + (39, 259), + (39, 260), + (39, 261), + (39, 262), + (40, 263), + (40, 264), + (40, 265), + (41, 266), + (41, 267), + (42, 268), + (42, 269), + (42, 270), + (42, 271), + (42, 272), + (43, 190), + (43, 273), + (43, 274), + (43, 275), + (43, 276), + (43, 277), + (43, 278), + (43, 279), + (43, 280), + (43, 186), + (44, 281), + (44, 164), + (44, 282), + (44, 154), + (44, 283), + (44, 284), + (44, 285), + (44, 286), + (44, 287), + (44, 288), + (44, 289), + (44, 290), + (44, 291), + (45, 292), + (45, 293), + (45, 294), + (46, 295), + (47, 165), + (47, 296), + (47, 166), + (47, 297), + (47, 298), + (47, 299), + (47, 160), + (47, 300), + (48, 301), + (48, 302), + (48, 303), + (48, 304), + (48, 305), + (48, 306), + (48, 307), + (48, 308), + (49, 309), + (49, 310), + (49, 311), + (50, 312), + (50, 313), + (51, 314), + (52, 315), + (52, 316), + (52, 317), + (52, 318), + (53, 319), + (53, 320), + (54, 321), + (54, 322), + (54, 323), + (54, 324), + (54, 325), + (54, 326), + (54, 327), + (54, 328), + (55, 329), + (55, 330), + (55, 331), + (55, 332), + (55, 333), + (56, 334), + (56, 335), + (57, 336), + (57, 337), + (57, 338), + (57, 339), + (57, 340), + (57, 341), + (57, 342), + (57, 343), + (58, 344), + (58, 345), + (58, 238), + (58, 346), + (58, 347), + (58, 348), + (58, 349), + (58, 350), + (58, 351), + (58, 352), + (58, 353), + (58, 237), + (58, 354), + (59, 355), + (59, 356), + (59, 357), + (59, 358), + (59, 359), + (60, 
360), + (60, 361), + (61, 362), + (61, 363), + (62, 364), + (62, 365); + +ALTER TABLE `transcripts` + DROP COLUMN `execs`, + DROP COLUMN `analysts`; + +DELETE FROM `transcripts` WHERE `id` = 1; +DELETE FROM `execs` WHERE `id` = 1; +DELETE FROM `analysts` WHERE `id` = 1; +DELETE FROM `execs_to_transcripts` WHERE `transcript_id` = 0; +DELETE FROM `analysts_to_transcripts` WHERE `transcript_id` = 0; diff --git a/scraping-alpha/Scraping_Alpha/execs.sql b/scraping-alpha/Scraping_Alpha/execs.sql new file mode 100644 index 0000000..df4e26f --- /dev/null +++ b/scraping-alpha/Scraping_Alpha/execs.sql @@ -0,0 +1,208 @@ +INSERT INTO `execs_to_transcripts` (`exec_id`, `transcript_id`) VALUES + (1, 0), + (2, 1), + (2, 2), + (2, 3), + (2, 4), + (3, 5), + (3, 6), + (3, 7), + (4, 8), + (4, 9), + (4, 10), + (5, 11), + (5, 12), + (5, 13), + (5, 14), + (6, 15), + (6, 16), + (6, 17), + (6, 18), + (6, 19), + (7, 20), + (7, 21), + (7, 22), + (7, 23), + (8, 24), + (8, 25), + (8, 26), + (9, 27), + (9, 28), + (9, 29), + (9, 30), + (10, 31), + (10, 32), + (11, 33), + (11, 34), + (11, 35), + (12, 36), + (12, 37), + (12, 38), + (12, 39), + (13, 40), + (13, 41), + (13, 42), + (14, 43), + (14, 44), + (14, 45), + (14, 46), + (14, 47), + (14, 48), + (14, 49), + (14, 50), + (14, 51), + (15, 52), + (15, 53), + (15, 54), + (16, 55), + (16, 56), + (16, 57), + (17, 58), + (17, 59), + (17, 60), + (17, 61), + (18, 62), + (18, 63), + (19, 64), + (19, 65), + (19, 66), + (20, 67), + (20, 68), + (21, 69), + (21, 70), + (21, 71), + (22, 72), + (22, 73), + (22, 74), + (23, 75), + (23, 76), + (23, 77), + (23, 78), + (24, 79), + (24, 80), + (24, 81), + (25, 82), + (25, 83), + (25, 84), + (26, 85), + (26, 86), + (26, 87), + (26, 88), + (27, 89), + (27, 90), + (27, 91), + (27, 92), + (27, 93), + (28, 94), + (28, 95), + (28, 96), + (29, 97), + (29, 98), + (29, 99), + (30, 100), + (30, 101), + (31, 102), + (31, 103), + (31, 104), + (31, 105), + (32, 106), + (32, 107), + (33, 108), + (33, 109), + (34, 110), + (34, 
111), + (35, 112), + (35, 113), + (35, 114), + (36, 115), + (36, 116), + (36, 117), + (36, 118), + (37, 119), + (37, 120), + (37, 121), + (38, 122), + (38, 123), + (39, 124), + (39, 125), + (39, 126), + (40, 127), + (40, 128), + (40, 129), + (41, 130), + (41, 131), + (41, 132), + (41, 133), + (42, 134), + (42, 135), + (42, 136), + (42, 137), + (42, 138), + (42, 139), + (43, 140), + (43, 141), + (43, 142), + (44, 143), + (44, 144), + (44, 145), + (45, 146), + (45, 147), + (45, 148), + (46, 149), + (46, 150), + (46, 151), + (47, 152), + (47, 153), + (47, 154), + (47, 155), + (48, 156), + (48, 157), + (48, 158), + (49, 159), + (49, 160), + (49, 161), + (50, 162), + (50, 163), + (51, 164), + (51, 165), + (52, 166), + (52, 167), + (52, 168), + (53, 169), + (53, 170), + (53, 171), + (54, 172), + (54, 173), + (54, 174), + (54, 175), + (54, 176), + (54, 177), + (55, 178), + (55, 179), + (55, 180), + (56, 181), + (56, 182), + (57, 183), + (57, 184), + (57, 185), + (58, 186), + (58, 187), + (58, 188), + (58, 189), + (58, 190), + (58, 191), + (58, 192), + (58, 193), + (58, 194), + (58, 195), + (59, 196), + (59, 197), + (59, 198), + (60, 199), + (60, 200), + (60, 201), + (61, 202), + (61, 203), + (62, 204), + (62, 205), + (62, 206); \ No newline at end of file diff --git a/scraping-alpha/Scraping_Alpha/execsAndAnalysts.py b/scraping-alpha/Scraping_Alpha/execsAndAnalysts.py index 1f97469..30fde31 100644 --- a/scraping-alpha/Scraping_Alpha/execsAndAnalysts.py +++ b/scraping-alpha/Scraping_Alpha/execsAndAnalysts.py @@ -102,3 +102,10 @@ with open("analysts.sql", 'rb+') as filehandle: filehandle.seek(-1, os.SEEK_END) filehandle.truncate() filehandle.write(";") + # `analysts.sql` then performs some cleanup on the database. 
+ filehandle.write("\n\nALTER TABLE `transcripts`\n\tDROP COLUMN `execs`,\n\tDROP COLUMN `analysts`;\n\n") + filehandle.write("DELETE FROM `transcripts` WHERE `id` = 0;\n") + filehandle.write("DELETE FROM `execs` WHERE `id` = 0;\n") + filehandle.write("DELETE FROM `analysts` WHERE `id` = 0;\n") + filehandle.write("DELETE FROM `execs_to_transcripts` WHERE `transcript_id` = 0;\n") + filehandle.write("DELETE FROM `analysts_to_transcripts` WHERE `transcript_id` = 0;\n")