2012年10月16日 星期二

panda for data access

panda 可以方便對大亮的資料處理,也提供很多查找的方式.例如 join, split, groupby, columns/rows select, hierarchical index supported... 雖然有點類似 SQL 的想法. 當相對的比 SQL 來的快速跟方便. 不要自己再多寫很多的 query 方式, 且可支援 cvs, HDF5(pytable) compression, json, ... 的資料格式. ex: 改寫 panda 的範例 算出 move avage ....
"""
Some examples playing around with yahoo finance data
"""

from datetime import datetime

import matplotlib.finance as fin
import numpy as np
from pylab import show
import pprint

from pandas import Index, DataFrame
from pandas.core.datetools import BMonthEnd
from pandas import ols

startDate = datetime(2009, 9, 1)
endDate = datetime(2009, 9, 10)

def getQuotes(symbol, start, end):
    quotes = fin.quotes_historical_yahoo(symbol, start, end)
    dates, open, close, high, low, volume = zip(*quotes)

    data = {
        'open' : open,
        'close' : close,
        'high' : high,
        'low' : low,
        'volume' : volume
    }

    dates = Index([datetime.fromordinal(int(d)) for d in dates])
    return DataFrame(data, index=dates)


def getMoveAvage(frame, label='close', mvavg=5):
    """ get move avage """

    assert(label in ['open', 'close', 'high', 'low', 'volume'])

    avgs    = []

    for indx, val in enumerate(frame.index):
        tot_sum = 0.0

        if indx > mvavg and mvavg >0:
            for i in range(mvavg):
                tot_sum += frame[label][indx-i]

            avgs.append(tot_sum/mvavg)

        else:
            avgs.append(0.0)

    data = {
            "%s_avg_%s" %(label,mvavg)  : avgs
            }

    return DataFrame(data, index=frame.index)


msft = getQuotes('MSFT', startDate, endDate)
msft_close_mv5 = getMoveAvage(msft, 'close', 5)
msft_open_mv5 = getMoveAvage(msft, 'open', 5)

new_msft = msft.join(msft_close_mv5)
print new_msft
用 np.sum 來加速 減少 memory access times
....

def getMoveAvage2(frame, label='close', mvavg=5):
    """ get move avage """

    assert(label in ['open', 'close', 'high', 'low', 'volume'])

    avgs    = []

    for indx, val in enumerate(frame.index):
        tot_sum = 0.0

        if indx > mvavg and mvavg >0:
            tot_sum = np.sum(frame[label][indx-mvavg+1:indx+1])

            avgs.append(tot_sum/mvavg)

        else:
            avgs.append(0.0)

    data = {
            "%s_avg_%s" %(label,mvavg)  : avgs
            }

    return DataFrame(data, index=frame.index)

#--------------------------------------

msft = getQuotes('MSFT', startDate, endDate)

profile.run("getMoveAvage(msft, 'close', 5)", 'status0')
p0 = pstats.Stats('status0')
p0.sort_stats('time', 'cumulative').print_stats(5)

profile.run("getMoveAvage2(msft, 'close', 5)", 'status1')
p1 = pstats.Stats('status1')
p1.sort_stats('time', 'cumulative').print_stats(5)

rst0 = eval("getMoveAvage(msft, 'close', 5)")
rst1 = eval("getMoveAvage2(msft, 'close', 5)")

refs: http://pandas.pydata.org/pandas-docs/dev/ http://www.pytables.org/moin

沒有留言:

張貼留言