1. pandas读取csv
import pandas as pd
df = pd.read_csv(<filename>)
print df.head() # 头部5行
print df.tail() # 尾部5行
print df.tail(n) # 尾部n行
print df[10:21] # 切片, 10到20行
print df['Close'].max() # 统计出最大收盘价
2. pandas绘制图表
"""Plot High prices for IBM"""
import pandas as pd
import matplotlib.pyplot as plt
def test_run():
    """Load data/IBM.csv and chart its 'High' column."""
    ibm = pd.read_csv("data/IBM.csv")
    # To chart two columns at once: ibm[['Close', 'Adj Close']].plot()
    ibm['High'].plot()
    plt.title('High prices for IBM')
    plt.xlabel('Time')
    plt.ylabel('High')
    plt.show()  # nothing is displayed until show() is called


if __name__ == "__main__":
    test_run()
csv中的日期是从上向下递减的(最新的日期在最上面), 而这里是按行号作为横轴绘图的, 所以读进来的日期顺序是反过来的, 曲线也是反过来的, 呈下降趋势, 需进行反转处理.
3. 构建及合并DataFrame, df.join()
- 创建日期范围列
# Build the index of dates for the window of interest.
start_date = '2010-01-22'
end_date = '2010-01-26'
dates = pd.date_range(start=start_date, end=end_date)
# Empty frame indexed by those dates; without index= the rows would be numbered 0, 1, 2, ...
df1 = pd.DataFrame(index=dates)
date_range()返回的是一个DatetimeIndex对象(不是普通的列表), 其中的元素为[2010-01-22, ..., 2010-01-26]
In: dates[0]
Out: 2010-01-22 00:00:00 # 股票信息中时间戳可忽略
合并DataFrame:
# Load only the Date and Adj Close columns, indexed by the parsed dates.
dfSPY = pd.read_csv(
    "data/SPY.csv",
    usecols=['Date', 'Adj Close'],
    index_col="Date",
    parse_dates=True,
    na_values=['nan'],
)
# Default (left) join: every date already in df1 is kept.
df1 = df1.join(dfSPY)
默认向左合并, dfSPY和df1索引中有交集的部分会被合并, 否则填充NaN, 所以会保留df1的全部数据.
read_csv()读入SPY.csv时默认使用整数(0, 1, 2, ...)作为索引, 所以需要用index_col指定索引列为"Date", 并用parse_dates=True把索引解析为datetime格式. 另外, na_values=['nan']告诉pandas把csv中的字符串'nan'当作缺失值NaN, 而不是当成普通的值处理.
之后使用dropna()直接将df1中为NaN的行去掉:
# Remove every row that contains a NaN value.
df1 = df1.dropna()
二者其实可以直接用一句话完成, 即把how参数设为'inner'(内连接), 只取两个索引的交集, 但是(作者观察到)这样得到的时间顺序为倒序
# Inner join: keep only the index labels present in both df1 and dfSPY.
df1 = df1.join(dfSPY, how = 'inner')
附df.join()中how参数的用法:
how : {‘left’, ‘right’, ‘outer’, ‘inner’}, default ‘left’
How to handle the operation of the two objects.
left: use calling frame’s(调用表) index (or column if on is specified)
right: use other’s index.(被调用表)
outer: form union of calling frame’s index (or column if on is specified) with other’s index, and sort it lexicographically (字典序).
inner: form intersection of calling frame’s index (or column if on is specified) with other’s index, preserving the order of the calling’s one.
在批量读入csv时, 为防止各文件的'Adj Close'列名冲突, 需要把该列重新命名为对应csv文件的名字(即股票代码symbol), 如下所示:
# Rename the price column to the ticker symbol so joined columns do not collide.
df_temp = df_temp.rename(columns={'Adj Close': symbol})
df = df.join(df_temp)
DataFrame重命名
当使用SPY作为参考时,需要将不交易的日期删掉:
# TODO: keep how='left' when dropping the rows where SPY has no data; the
# author observed that how='inner' reversed the date order, whereas dropna
# with subset kept it ascending (confirm on the pandas version in use).
df = df.join(df_temp, how='left')
if symbol == 'SPY':
    # symbol is 'SPY' here, so this drops the rows where the SPY column is NaN.
    df = df.dropna(subset=[symbol])
纽约证交所一年交易日252天, SPY除了周末和节假日都开盘, 所以经常用SPY作为日期参考
4. DataFrame切片
- 行切片
df.loc['2010-01-01':'2010-01-31']  # label-based row slice (.ix was removed in pandas 1.0)
- 列切片
df['GOOG'] # a single column
df[['IBM', 'GLD']] # several columns: pass a list of labels
- 行列同时切片
df.loc['2010-01-01':'2010-01-31', ['IBM', 'GLD']]  # rows by label range, columns by list (.ix was removed in pandas 1.0)
5.图表相关
- 图表归一化:
df1 = df1 / df1.iloc[0]  # divide by the first row so every curve starts at 1.0 (df1[0] would select a column labelled 0)
归一化的目的是凸显股票波动
注意:根据 pandas 语法,该操作应该读成:
df = df / df.iloc[0]  # .ix was removed in pandas 1.0; .iloc[0] is the first row by position
或者,更明确地读成:
df = df / df.iloc[0, :]  # first row, all columns (.ix was removed in pandas 1.0)
这种向量化的写法在底层是由C实现的, 比在Python层面用两层循环遍历整个表格要快得多
- 将df数据传入使用matplotlib.pyplot绘制图表:
import matplotlib.pyplot as plt
def plot_data(df, title="Stock prices"):
    """Chart the columns of df with a title and labelled axes, then show it.

    df.plot() is called with the given title and fontsize=12; the axes
    object it returns is used to label the x-axis "Date" and the y-axis
    "Price".
    """
    axes = df.plot(fontsize=12, title=title)
    axes.set_ylabel("Price")
    axes.set_xlabel("Date")
    plt.show()
注意: set_xlabel()方法是调用df.plot后返回对象的方法, 可以理解为handler
全部代码如下:
"""Slice and plot"""
import os
import pandas as pd
import matplotlib.pyplot as plt
def plot_selected(df, columns, start_index, end_index):
    """Plot the desired columns over index values in the given range.

    Args:
        df: DataFrame to take the data from.
        columns: list of column labels to plot.
        start_index: first index label of the row range.
        end_index: last index label of the row range.
    """
    # .ix was removed in pandas 1.0; .loc performs the label-based selection.
    plot_data(df.loc[start_index:end_index, columns], title="Selected data")
def symbol_to_path(symbol, base_dir="data"):
    """Build the path of the CSV file for a ticker symbol.

    The file name is the symbol followed by ".csv", located inside base_dir.
    """
    file_name = str(symbol) + ".csv"
    return os.path.join(base_dir, file_name)
def get_data(symbols, dates):
    """Read stock data (adjusted close) for given symbols from CSV files.

    Args:
        symbols: iterable of ticker symbols; 'SPY' is read first as a
            reference when it is not already present.
        dates: index (e.g. from pd.date_range) for the resulting frame.

    Returns:
        DataFrame indexed by dates with one column per symbol, restricted
        to the rows where SPY has a value.
    """
    df = pd.DataFrame(index=dates)
    # Work on a copy so the caller's list is not modified by the insert below.
    symbols = list(symbols)
    if 'SPY' not in symbols:  # add SPY for reference, if absent
        symbols.insert(0, 'SPY')
    for symbol in symbols:
        df_temp = pd.read_csv(symbol_to_path(symbol), index_col='Date',
                              parse_dates=True, usecols=['Date', 'Adj Close'],
                              na_values=['nan'])
        # Name the price column after the ticker so joined columns do not collide.
        df_temp = df_temp.rename(columns={'Adj Close': symbol})
        df = df.join(df_temp)
        if symbol == 'SPY':  # drop dates SPY did not trade
            df = df.dropna(subset=["SPY"])
    return df
def plot_data(df, title="Stock prices"):
    """Chart the columns of df with a title and labelled axes, then show it.

    df.plot() is called with the given title and fontsize=12; the x-axis is
    labelled "Date" and the y-axis "Price".
    """
    chart = df.plot(fontsize=12, title=title)
    chart.set_ylabel("Price")
    chart.set_xlabel("Date")
    plt.show()
def normalize_data(df):
    """Normalize stock prices using the first row of the dataframe.

    Every column is divided by its value in the first row, so each series
    starts at 1.0. The input frame is not modified.
    """
    # .ix was removed in pandas 1.0; .iloc[0] selects the first row by position.
    return df / df.iloc[0]
def test_run():
    """Read 2010 prices for a few tickers, normalize them and plot a slice."""
    # Date range covering all of 2010.
    date_index = pd.date_range('2010-01-01', '2010-12-31')
    # Tickers to load; get_data() adds SPY as a reference when it is missing.
    tickers = ['GOOG', 'IBM', 'GLD']
    # Load the prices and normalize them in one step.
    prices = normalize_data(get_data(tickers, date_index))
    # Plot SPY and IBM for March 2010.
    plot_selected(prices, ['SPY', 'IBM'], '2010-03-01', '2010-04-01')


if __name__ == "__main__":
    test_run()
网友评论