panda 可以方便對大亮的資料處理,也提供很多查找的方式.例如 join, split, groupby, columns/rows select, hierarchical index supported... 雖然有點類似 SQL 的想法. 當相對的比 SQL 來的快速跟方便.
不要自己再多寫很多的 query 方式, 且可支援 cvs, HDF5(pytable) compression, json, ... 的資料格式.
ex: 改寫 panda 的範例 算出 move avage ....
"""
Some examples playing around with yahoo finance data
"""
from datetime import datetime
import matplotlib.finance as fin
import numpy as np
from pylab import show
import pprint
from pandas import Index, DataFrame
from pandas.core.datetools import BMonthEnd
from pandas import ols
startDate = datetime(2009, 9, 1)
endDate = datetime(2009, 9, 10)
def getQuotes(symbol, start, end):
quotes = fin.quotes_historical_yahoo(symbol, start, end)
dates, open, close, high, low, volume = zip(*quotes)
data = {
'open' : open,
'close' : close,
'high' : high,
'low' : low,
'volume' : volume
}
dates = Index([datetime.fromordinal(int(d)) for d in dates])
return DataFrame(data, index=dates)
def getMoveAvage(frame, label='close', mvavg=5):
""" get move avage """
assert(label in ['open', 'close', 'high', 'low', 'volume'])
avgs = []
for indx, val in enumerate(frame.index):
tot_sum = 0.0
if indx > mvavg and mvavg >0:
for i in range(mvavg):
tot_sum += frame[label][indx-i]
avgs.append(tot_sum/mvavg)
else:
avgs.append(0.0)
data = {
"%s_avg_%s" %(label,mvavg) : avgs
}
return DataFrame(data, index=frame.index)
msft = getQuotes('MSFT', startDate, endDate)
msft_close_mv5 = getMoveAvage(msft, 'close', 5)
msft_open_mv5 = getMoveAvage(msft, 'open', 5)
new_msft = msft.join(msft_close_mv5)
print new_msft
用 np.sum 來加速 減少 memory access times
....
def getMoveAvage2(frame, label='close', mvavg=5):
""" get move avage """
assert(label in ['open', 'close', 'high', 'low', 'volume'])
avgs = []
for indx, val in enumerate(frame.index):
tot_sum = 0.0
if indx > mvavg and mvavg >0:
tot_sum = np.sum(frame[label][indx-mvavg+1:indx+1])
avgs.append(tot_sum/mvavg)
else:
avgs.append(0.0)
data = {
"%s_avg_%s" %(label,mvavg) : avgs
}
return DataFrame(data, index=frame.index)
#--------------------------------------
msft = getQuotes('MSFT', startDate, endDate)
profile.run("getMoveAvage(msft, 'close', 5)", 'status0')
p0 = pstats.Stats('status0')
p0.sort_stats('time', 'cumulative').print_stats(5)
profile.run("getMoveAvage2(msft, 'close', 5)", 'status1')
p1 = pstats.Stats('status1')
p1.sort_stats('time', 'cumulative').print_stats(5)
rst0 = eval("getMoveAvage(msft, 'close', 5)")
rst1 = eval("getMoveAvage2(msft, 'close', 5)")
refs:
http://pandas.pydata.org/pandas-docs/dev/
http://www.pytables.org/moin