读取
数据文件的格式千奇百怪,不同的数据文件读取方式也不一样。
如果数据文件提供的是csv、txt、xls、xlsx、pkl这类常见格式,可以用pandas的相应方法读取:
import pandas as pd
def load_data(filename, file_dir=None):
    """Load a data file with the pandas reader that matches its extension.

    Supported extensions (case-insensitive): .csv, .txt, .xls, .xlsx, .pkl.

    Args:
        filename: Name of the file, including its extension. It is joined
            onto the data directory before being read.
        file_dir: Directory that contains the file. When ``None`` (the
            default) the module-level ``FILE_DIR`` is used.

    Returns:
        The object returned by the matching pandas reader.

    Raises:
        ValueError: If the file extension is not one of the supported ones.
    """
    import os  # local import: the visible part of this file never imports os

    readers = {
        '.csv': pd.read_csv,
        '.txt': pd.read_table,
        '.xls': pd.read_excel,
        '.xlsx': pd.read_excel,
        # NOTE: unpickling can execute arbitrary code; only load trusted files.
        '.pkl': pd.read_pickle,
    }

    postfix = os.path.splitext(filename)[1].lower()
    reader = readers.get(postfix)
    if reader is None:
        # Fail explicitly instead of hitting an unbound local at return time.
        raise ValueError(
            'Unsupported file extension %r for %r; expected one of %s'
            % (postfix, filename, ', '.join(sorted(readers)))
        )

    if file_dir is None:
        file_dir = FILE_DIR
    return reader(os.path.join(file_dir, filename))
有时候也会遇到一些其他格式的数据,例如mat格式
## 方法一:scipy方法
import scipy.io as scio
data_path = "train.mat"
# loadmat returns a dict keyed by the variable names stored in the .mat file.
data = scio.loadmat(data_path)
# Read from `data` (the dict loaded above); the original referenced an
# undefined name `data_train` here.
data_train_label = data.get('label')  # the 'label' entry of the dict
data_train_data = data.get('data')  # the 'data' entry of the dict
## 方法二:mat4py方法
import mat4py
# Load the whole .mat file, then pick out the 'student' entry.
student_mat_path = 'student.mat'
data2 = mat4py.loadmat(student_mat_path)
student = data2['student']
## 方法三:h5py方法
from pandas import Series, DataFrame
import pandas as pd
import numpy as np
import h5py
datapath2 = 'data10.mat'
# Open read-only inside a context manager so the file handle is always closed.
with h5py.File(datapath2, 'r') as file:
    data3 = file['CH01'][:]  # copy the full 'CH01' dataset into memory

datapath3 = 'data3.txt'
dfdata = pd.DataFrame(data3)
# Write to datapath3 (the original passed an undefined name, datapath1).
dfdata.to_csv(datapath3)  # written in CSV format
生成
有时候我们为了测试算法,需要找一些特定分布的数据,这时候可以采用数据生成方法。
随机样本生成:
import matplotlib.pyplot as plt
from sklearn.datasets import make_classification
# make_classification: draw a random classification sample set.
classification_kwargs = dict(
    n_samples=100,
    n_features=20,  # total number of features
    n_informative=2,  # number of informative features
    n_redundant=2,  # redundant features: random linear combinations of the informative ones
    n_repeated=0,  # duplicated features, drawn at random from the informative and redundant ones
    n_classes=2,  # number of classes
    n_clusters_per_class=2,  # number of clusters that make up each class
    weights=None,
    flip_y=0.01,
    class_sep=1.0,
    hypercube=True,
    shift=0.0,
    scale=1.0,
    shuffle=True,
    random_state=None,
)
X, Y = make_classification(**classification_kwargs)

# Scatter the first two feature columns, coloured by class label.
plt.scatter(X[:, 0], X[:, 1], c=Y, marker='o')
plt.show()
![](https://img.haomeiwen.com/i9210113/f19ba2c4424bb8c0.png)
高斯分布样本生成:
import matplotlib.pyplot as plt
from sklearn.datasets import make_gaussian_quantiles
# make_gaussian_quantiles: draw Gaussian-distributed samples split into classes.
gaussian_kwargs = dict(
    mean=None,
    cov=1.0,
    n_samples=100,
    n_features=2,
    n_classes=3,
    shuffle=True,
    random_state=None,
)
X, Y = make_gaussian_quantiles(**gaussian_kwargs)

# Scatter the two feature columns, coloured by class label.
plt.scatter(X[:, 0], X[:, 1], c=Y, marker='o')
plt.show()
![](https://img.haomeiwen.com/i9210113/13a950b9fca6cc7f.png)
环形分布样本生成:
import matplotlib.pyplot as plt
from sklearn.datasets import make_circles
# make_circles: draw samples arranged in two concentric circles.
circles_kwargs = dict(
    n_samples=100,
    shuffle=True,
    noise=None,
    random_state=None,
    factor=0.8,  # scale factor between the inner and outer circle, must be < 1
)
X, Y = make_circles(**circles_kwargs)

# Scatter the two coordinates, coloured by class label.
plt.scatter(X[:, 0], X[:, 1], c=Y, marker='o')
plt.show()
![](https://img.haomeiwen.com/i9210113/39869914f61ebbd2.png)
半环形分布样本生成:
import matplotlib.pyplot as plt
from sklearn.datasets import make_moons
# make_moons: draw samples arranged in two interleaving half circles.
moons_kwargs = dict(
    n_samples=100,
    shuffle=True,
    noise=None,
    random_state=None,
)
X, Y = make_moons(**moons_kwargs)

# Scatter the two coordinates, coloured by class label.
plt.scatter(X[:, 0], X[:, 1], c=Y, marker='o')
plt.show()
![](https://img.haomeiwen.com/i9210113/7e5e26e0ad07d382.png)
根据公式生成需要的点集:
# -*- coding: UTF-8 -*-
import matplotlib.pyplot as plt
# Build the point set y = x^2 for x = 1..1000.
x_values = list(range(1, 1001))
y_values = [x * x for x in x_values]

plt.scatter(x_values, y_values, s=40)

# Set the value range of each axis: [xmin, xmax, ymin, ymax].
axis_limits = [0, 1100, 0, 1100000]
plt.axis(axis_limits)
plt.show()
![](https://img.haomeiwen.com/i9210113/fe7e2693e2000dba.png)
# 正态分布数据集
import numpy as np
import matplotlib.pyplot as plt
mu = 1  # mean of the distribution
sigma = 3  # standard deviation
num = 10000  # number of samples to draw
rand_data = np.random.normal(mu, sigma, num)

# density=True normalises the histogram so it can be compared with the PDF
# below. The older `normed` keyword was removed in Matplotlib 3.1.
count, bins, ignored = plt.hist(rand_data, 30, density=True)

# Overlay the theoretical normal PDF evaluated at the bin edges.
pdf = 1 / (sigma * np.sqrt(2 * np.pi)) * np.exp(-(bins - mu) ** 2 / (2 * sigma ** 2))
plt.plot(bins, pdf, linewidth=2, color='r')
plt.show()
![](https://img.haomeiwen.com/i9210113/14ba918f6f56082a.png)
网友评论