This repository has been archived on 2022-08-01. You can view files and clone it, but cannot push or open issues or pull requests.
Scraping-Alpha/Scraping_Alpha/JSONtoSQL.py

127 lines
3.8 KiB
Python

#
# Title: Scraping Alpha
# Version: 1.0
# Author: Ben Goldsworthy <b.goldsworthy@lancaster.ac.uk>
#
# This file is a part of Scraping Alpha, a series of scripts to scrape
# earnings call transcripts from seekingalpha.com and present them as useful
# SQL.
#
# This file takes the `transcripts.json` file output of `transcript_spider.py`
# and converts it into SQL.
#
# This file should be located in the same directory as `transcripts.json`, and
# is run via 'python JSONtoSQL.py > [FILE].sql', where '[FILE]' is the name of
# the output file.
#
import json
import sys
import codecs
# Wrap stdout in a UTF-8 writer so that non-ASCII text in the transcripts is
# encoded as UTF-8 when the SQL is written out.
sys.stdout = codecs.getwriter('utf-8')(sys.stdout)

# Read the spider's output. The `with` block closes the file handle as soon as
# its contents have been read, rather than leaving it open for the whole run.
with open('transcripts.json') as json_file:
    json_data = json_file.read()
data = json.loads(json_data)
executives = []
analysts = []


def _assign_keys(people, known, keys_by_person):
    """Return the 1-based database keys for `people`, registering new ones.

    `known` is the ordered list of distinct people seen so far; a person's key
    is their position in that list plus one. `keys_by_person` maps each person
    already in `known` to that key, so a lookup does not rescan the list.
    Repeated people within `people` yield repeated keys, in order.
    """
    keys = []
    for person in people:
        # Lists loaded from JSON cannot be dictionary keys, so look people up
        # by an equivalent tuple. Assumes each person is a flat list of
        # strings -- TODO confirm against the spider's output.
        lookup = tuple(person)
        if lookup not in keys_by_person:
            known.append(person)
            keys_by_person[lookup] = len(known)
        keys.append(keys_by_person[lookup])
    return keys


# For each transcript, creates new, separate arrays of executives and analysts
# for their own database tables, replacing their tuples in the transcript with
# their database keys.
_exec_keys = {}
_analyst_keys = {}
for entry in data:
    transcript = entry['entry']
    transcript['exec'] = _assign_keys(
        transcript['exec'], executives, _exec_keys)
    transcript['analysts'] = _assign_keys(
        transcript['analysts'], analysts, _analyst_keys)
# Outputs the SQL file that creates the various tables and populates them with
# INSERT statements.


def _sql_string(value):
    """Return the text `value` as a double-quoted SQL string literal.

    Backslashes and double quotes inside `value` are backslash-escaped so that
    they cannot terminate the literal early (e.g. quoted speech in a
    transcript).
    """
    return '"' + value.replace('\\', '\\\\').replace('"', '\\"') + '"'


def _write_table(name, columns, rows):
    """Write the CREATE TABLE statement for `name`, then one INSERT for `rows`.

    `columns` is a list of (column name, SQL type) pairs describing every
    column other than the auto-incrementing `id` primary key. `rows` is an
    iterable of tuples of text values, one value per entry in `columns`.
    The INSERT is left out entirely when `rows` yields nothing, because
    `VALUES` followed by no rows is not valid SQL.
    """
    write = sys.stdout.write
    write("CREATE TABLE IF NOT EXISTS `" + name + "`\n")
    write("(\n")
    write("\t`id` INT NOT NULL UNIQUE AUTO_INCREMENT,\n")
    for column, sql_type in columns:
        write("\t`" + column + "` " + sql_type + ",\n")
    write("\tPRIMARY KEY(`id`)\n")
    write(");\n\n")

    column_list = ", ".join("`" + column + "`" for column, _ in columns)
    # The first row is preceded by the INSERT header and every later row by a
    # separating comma, so rows can be streamed without a leading dummy row.
    prefix = "INSERT INTO `" + name + "` (" + column_list + ") VALUES\n"
    wrote_any = False
    for row in rows:
        values = ",".join(_sql_string(value) for value in row)
        write(prefix + "\t(" + values + ")")
        prefix = ",\n"
        wrote_any = True
    if wrote_any:
        write(";\n\n")


def _transcript_row(tran):
    """Return the column values for one transcript.

    The executive and analyst keys are each flattened into a single
    ';'-separated string.
    """
    return (
        tran['title'],
        tran['company'],
        ';'.join(str(key) for key in tran['exec']),
        ';'.join(str(key) for key in tran['analysts']),
        tran['transcript'],
    )


# No placeholder row is written ahead of the real ones: the keys stored in a
# transcript's `execs` and `analysts` columns are 1-based positions in the
# `executives` and `analysts` lists, so the first real row of each table has
# to be the one that receives AUTO_INCREMENT id 1.
_write_table(
    "execs",
    [("name", "VARCHAR(255)"),
     ("position", "VARCHAR(255)"),
     ("company", "VARCHAR(255)")],
    ((executive[0], executive[1], executive[2]) for executive in executives)
)
_write_table(
    "analysts",
    [("name", "VARCHAR(255)"),
     ("company", "VARCHAR(255)")],
    ((analyst[0], analyst[1]) for analyst in analysts)
)
_write_table(
    "transcripts",
    [("title", "VARCHAR(255)"),
     ("company", "VARCHAR(255)"),
     ("execs", "VARCHAR(255)"),
     ("analysts", "VARCHAR(255)"),
     ("transcript", "TEXT")],
    (_transcript_row(entry['entry']) for entry in data)
)
# Schema for the two link tables that pair an executive or analyst id with a
# transcript id. Only the CREATE TABLE statements are written here; no rows
# are inserted into these tables.
link_table_sql = "\n".join([
    "CREATE TABLE IF NOT EXISTS `execs_to_transcripts`",
    "(",
    "\t`exec_id` INT NOT NULL,",
    "\t`transcript_id` INT NOT NULL,",
    "\tPRIMARY KEY(`exec_id`, `transcript_id`),",
    "\tFOREIGN KEY (`exec_id`) REFERENCES `execs`(`id`),",
    "\tFOREIGN KEY (`transcript_id`) REFERENCES `transcripts`(`id`)",
    # The embedded newline leaves a blank line between the two statements.
    ");\n",
    "CREATE TABLE IF NOT EXISTS `analysts_to_transcripts`",
    "(",
    "\t`analyst_id` INT NOT NULL,",
    "\t`transcript_id` INT NOT NULL,",
    "\tPRIMARY KEY(`analyst_id`, `transcript_id`),",
    "\tFOREIGN KEY (`analyst_id`) REFERENCES `analysts`(`id`),",
    "\tFOREIGN KEY (`transcript_id`) REFERENCES `transcripts`(`id`)",
    ");",
])
sys.stdout.write(link_table_sql + "\n")