"""
Import historical stock data from yahoo finance.
"""
from datetime import datetime
from pandas.io import data as pdata
import argparse
import numpy
import predictionio
import pytz
import sys
import time
EPOCH = datetime(1970, 1, 1, tzinfo=pytz.utc)
SP500_LIST = [
    "A", "AA", "AAPL", "ABBV", "ABC", "ABT", "ACE", "ACN", "ACT", "ADBE", "ADI",
    "ADM", "ADP", "ADS", "ADSK", "ADT", "AEE", "AEP", "AES", "AET", "AFL",
    "AGN", "AIG", "AIV", "AIZ", "AKAM", "ALL", "ALLE", "ALTR", "ALXN", "AMAT",
    "AME", "AMGN", "AMP", "AMT", "AMZN", "AN", "AON", "APA", "APC", "APD",
    "APH", "ARG", "ATI", "AVB", "AVP", "AVY", "AXP", "AZO", "BA", "BAC", "BAX",
    "BBBY", "BBT", "BBY", "BCR", "BDX", "BEAM", "BEN", "BF-B", "BHI", "BIIB",
    "BK", "BLK", "BLL", "BMS", "BMY", "BRCM", "BRK-B", "BSX", "BTU", "BWA",
    "BXP", "C", "CA", "CAG", "CAH", "CAM", "CAT", "CB", "CBG", "CBS", "CCE",
    "CCI", "CCL", "CELG", "CERN", "CF", "CFN", "CHK", "CHRW", "CI", "CINF",
    "CL", "CLX", "CMA", "CMCSA", "CME", "CMG", "CMI", "CMS", "CNP", "CNX",
    "COF", "COG", "COH", "COL", "COP", "COST", "COV", "CPB", "CRM", "CSC",
    "CSCO", "CSX", "CTAS", "CTL", "CTSH", "CTXS", "CVC", "CVS", "CVX", "D",
    "DAL", "DD", "DE", "DFS", "DG", "DGX", "DHI", "DHR", "DIS", "DISCA", "DLPH",
    "DLTR", "DNB", "DNR", "DO", "DOV", "DOW", "DPS", "DRI", "DTE", "DTV", "DUK",
    "DVA", "DVN", "EA", "EBAY", "ECL", "ED", "EFX", "EIX", "EL", "EMC", "EMN",
    "EMR", "EOG", "EQR", "EQT", "ESRX", "ESS", "ESV", "ETFC", "ETN", "ETR",
    "EW", "EXC", "EXPD", "EXPE", "F", "FAST", "FB", "FCX", "FDO", "FDX", "FE",
    "FFIV", "FIS", "FISV", "FITB", "FLIR", "FLR", "FLS", "FMC", "FOSL", "FOXA",
    "FRX", "FSLR", "FTI", "FTR", "GAS", "GCI", "GD", "GE", "GGP", "GHC", "GILD",
    "GIS", "GLW", "GM", "GMCR", "GME", "GNW", "GOOG", "GOOGL", "GPC", "GPS",
    "GRMN", "GS", "GT", "GWW", "HAL", "HAR", "HAS", "HBAN", "HCBK", "HCN",
    "HCP", "HD", "HES", "HIG", "HOG", "HON", "HOT", "HP", "HPQ", "HRB", "HRL",
    "HRS", "HSP", "HST", "HSY", "HUM", "IBM", "ICE", "IFF", "IGT", "INTC",
    "INTU", "IP", "IPG", "IR", "IRM", "ISRG", "ITW", "IVZ", "JBL", "JCI", "JEC",
    "JNJ", "JNPR", "JOY", "JPM", "JWN", "K", "KEY", "KIM", "KLAC", "KMB", "KMI",
    "KMX", "KO", "KORS", "KR", "KRFT", "KSS", "KSU", "L", "LB", "LEG", "LEN",
    "LH", "LLL", "LLTC", "LLY", "LM", "LMT", "LNC", "LO", "LOW", "LRCX", "LSI",
    "LUK", "LUV", "LYB", "M", "MA", "MAC", "MAR", "MAS", "MAT", "MCD", "MCHP",
    "MCK", "MCO", "MDLZ", "MDT", "MET", "MHFI", "MHK", "MJN", "MKC", "MMC",
    "MMM", "MNST", "MO", "MON", "MOS", "MPC", "MRK", "MRO", "MS", "MSFT", "MSI",
    "MTB", "MU", "MUR", "MWV", "MYL", "NBL", "NBR", "NDAQ", "NE", "NEE", "NEM",
    "NFLX", "NFX", "NI", "NKE", "NLSN", "NOC", "NOV", "NRG", "NSC", "NTAP",
    "NTRS", "NU", "NUE", "NVDA", "NWL", "NWSA", "OI", "OKE", "OMC", "ORCL",
    "ORLY", "OXY", "PAYX", "PBCT", "PBI", "PCAR", "PCG", "PCL", "PCLN", "PCP",
    "PDCO", "PEG", "PEP", "PETM", "PFE", "PFG", "PG", "PGR", "PH", "PHM", "PKI",
    "PLD", "PLL", "PM", "PNC", "PNR", "PNW", "POM", "PPG", "PPL", "PRGO", "PRU",
    "PSA", "PSX", "PVH", "PWR", "PX", "PXD", "QCOM", "QEP", "R", "RAI", "RDC",
    "REGN", "RF", "RHI", "RHT", "RIG", "RL", "ROK", "ROP", "ROST", "RRC", "RSG",
    "RTN", "SBUX", "SCG", "SCHW", "SE", "SEE", "SHW", "SIAL", "SJM", "SLB",
    "SLM", "SNA", "SNDK", "SNI", "SO", "SPG", "SPLS", "SRCL", "SRE", "STI",
    "STJ", "STT", "STX", "STZ", "SWK", "SWN", "SWY", "SYK", "SYMC", "SYY", "T",
    "TAP", "TDC", "TE", "TEG", "TEL", "TGT", "THC", "TIF", "TJX", "TMK", "TMO",
    "TRIP", "TROW", "TRV", "TSCO", "TSN", "TSO", "TSS", "TWC", "TWX", "TXN",
    "TXT", "TYC", "UNH", "UNM", "UNP", "UPS", "URBN", "USB", "UTX", "V", "VAR",
    "VFC", "VIAB", "VLO", "VMC", "VNO", "VRSN", "VRTX", "VTR", "VZ", "WAG",
    "WAT", "WDC", "WEC", "WFC", "WFM", "WHR", "WIN", "WLP", "WM", "WMB", "WMT",
    "WU", "WY", "WYN", "WYNN", "X", "XEL", "XL", "XLNX", "XOM", "XRAY", "XRX",
    "XYL", "YHOO", "YUM", "ZION", "ZMH", "ZTS"]

ETF_LIST = ["QQQ", "SPY", "XLY", "XLP", "XLE", "XLF", "XLV",
            "XLI", "XLB", "XLK", "XLU"]


def since_epoch(dt):
    """Return seconds between a timezone-aware datetime and the Unix epoch."""
    return (dt - EPOCH).total_seconds()
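
# A quick sanity check for since_epoch (illustrative only, not part of the
# import flow): one day after the epoch is exactly 86400 seconds.
#
#   >>> since_epoch(datetime(1970, 1, 2, tzinfo=pytz.utc))
#   86400.0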


def import_data(client, access_key, ticker, start_time, end_time, event_time):
    """Fetch one ticker from Yahoo Finance and send it as a $set event."""
    print "Importing:", ticker, start_time, end_time
    try:
        df = pdata.DataReader(ticker, 'yahoo', start_time, end_time)
        print "Extracted:", df.index[0], df.index[-1]
    except IOError as ex:
        print ex
        print "Data does not exist. Returning."
        return

    # Assume we only extract US market data.
    eastern = pytz.timezone('US/Eastern')
    columns = [
        ('Open', 'open'),
        ('High', 'high'),
        ('Low', 'low'),
        ('Close', 'close'),
        ('Volume', 'volume'),
        ('Adj Close', 'adjclose')]

    yahoo_data = dict()
    yahoo_data['ticker'] = ticker
    yahoo_data['t'] = [
        # hour=16 marks the market close time in US/Eastern.
        since_epoch(eastern.localize(date_.to_pydatetime().replace(hour=16)))
        for date_ in df.index]
    for column in columns:
        yahoo_data[column[1]] = map(numpy.asscalar, df[column[0]].values)

    properties = {'yahoo': yahoo_data}
    response = client.create_event(
        event='$set',
        entity_type='yahoo',
        entity_id=ticker,
        properties=properties,
        event_time=event_time.replace(tzinfo=pytz.utc))
    print response
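
# For reference, the $set event built above has roughly this JSON shape
# (ticker and values are illustrative; the field names come from the
# `columns` mapping, and `t` holds seconds since the epoch at 16:00
# US/Eastern for each trading day):
#
#   {
#     "event": "$set",
#     "entityType": "yahoo",
#     "entityId": "AAPL",
#     "properties": {
#       "yahoo": {
#         "ticker": "AAPL",
#         "t": [1388696400.0, 1388782800.0],
#         "open": [...], "high": [...], "low": [...],
#         "close": [...], "volume": [...], "adjclose": [...]
#       }
#     }
#   }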


def import_all(access_key):
    """Import all S&P 500 stocks and some SPDR ETFs."""
    time_slices = [
        (datetime(1999, 1, 1), datetime(2004, 1, 1), datetime(2004, 1, 2)),
        (datetime(2003, 12, 1), datetime(2009, 1, 1), datetime(2009, 1, 2)),
        (datetime(2008, 12, 1), datetime(2014, 9, 1), datetime(2014, 9, 2)),
    ]
    url = 'http://localhost:7070'
    client = predictionio.EventClient(access_key=access_key, threads=1, url=url)
    tickers = SP500_LIST + ETF_LIST
    for ticker in tickers:
        for time_slice in time_slices:
            import_data(client, access_key, ticker,
                        time_slice[0], time_slice[1], time_slice[2])


def import_data_with_gaps(access_key):
    """Import data with time gaps.

    Data imported by this method is used by the stock engine; it demonstrates
    how the engine handles time series data with gaps.
    """
    # The time_slices below are discontinuous and overlapping.
    # Each tuple is (start_time, end_time, event_time).
    time_slices = [
        (datetime(2013, 12, 1), datetime(2014, 2, 1), datetime(2014, 2, 2)),
        (datetime(2014, 1, 1), datetime(2014, 1, 20), datetime(2014, 2, 10)),
        (datetime(2014, 1, 10), datetime(2014, 2, 20), datetime(2014, 2, 28)),
        (datetime(2014, 2, 10), datetime(2014, 3, 31), datetime(2014, 4, 2)),
        (datetime(2014, 5, 1), datetime(2014, 6, 15), datetime(2014, 6, 20)),
        (datetime(2014, 6, 1), datetime(2014, 7, 1), datetime(2014, 7, 15)),
    ]
    tickers = ['SPY', 'AAPL', 'IBM', 'MSFT']
    url = 'http://localhost:7070'
    client = predictionio.EventClient(access_key=access_key, threads=1, url=url)
    for ticker in tickers:
        for time_slice in time_slices:
            import_data(client, access_key, ticker,
                        time_slice[0], time_slice[1], time_slice[2])

    # Below are data with holes.
    time_slices = [
        (datetime(2014, 1, 1), datetime(2014, 1, 20), datetime(2014, 2, 10)),
        (datetime(2014, 2, 10), datetime(2014, 3, 31), datetime(2014, 4, 2)),
        (datetime(2014, 6, 1), datetime(2014, 7, 1), datetime(2014, 7, 15)),
    ]
    tickers = ['AMZN']
    for ticker in tickers:
        for time_slice in time_slices:
            import_data(client, access_key, ticker,
                        time_slice[0], time_slice[1], time_slice[2])

    time_slices = [
        (datetime(2014, 1, 10), datetime(2014, 2, 20), datetime(2014, 2, 28)),
        (datetime(2014, 2, 10), datetime(2014, 3, 31), datetime(2014, 4, 2)),
    ]
    tickers = ['FB']
    for ticker in tickers:
        for time_slice in time_slices:
            import_data(client, access_key, ticker,
                        time_slice[0], time_slice[1], time_slice[2])


def import_one(access_key):
    """Import TSLA.

    Import data from 2014-01-01 until 2014-03-01. event_time specifies when
    this data was extracted.
    """
    start_time = datetime(2014, 1, 1)
    end_time = datetime(2014, 3, 1)
    event_time = datetime(2014, 9, 1)
    ticker = 'TSLA'
    url = 'http://localhost:7070'
    client = predictionio.EventClient(access_key=access_key, threads=1, url=url)
    import_data(client, access_key, ticker, start_time, end_time, event_time)


if __name__ == '__main__':
    if len(sys.argv) < 2:
        sys.exit("Usage: python -m examples.import_yahoo <access_key>")
    access_key = sys.argv[1]
    import_all(access_key=access_key)
    # import_data_with_gaps(access_key=access_key)
    # import_one(access_key=access_key)
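
# Example invocation (assumes a running PredictionIO event server on
# localhost:7070 and a valid app access key; YOUR_ACCESS_KEY is a
# placeholder):
#
#   $ python -m examples.import_yahoo YOUR_ACCESS_KEY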