美文网首页
[Python与数据分析]-10IF

[Python与数据分析]-10IF

作者: 六千宛 | 来源:发表于2020-08-06 18:08 被阅读0次

@官方

#!/usr/bin/python
# -*- coding:utf-8 -*-
 
import numpy as np
import matplotlib.pyplot as plt
from sklearn.ensemble import IsolationForest
from scipy import stats
 
# Demo: fit an IsolationForest on two tight Gaussian blobs plus uniform noise
# and plot the learned inlier/outlier boundary.
rng = np.random.RandomState(42)

# Build the training set.
n_samples = 200  # total number of samples
outliers_fraction = 0.25  # fraction of the samples that are outliers
n_inliers = int((1. - outliers_fraction) * n_samples)
n_outliers = int(outliers_fraction * n_samples)

# Inliers: the same Gaussian noise shifted to (2, 2) and to (-2, -2).
X = 0.3 * rng.randn(n_inliers // 2, 2)
X_train = np.r_[X + 2, X - 2]
# Remember where the inliers end so the scatter plots below can split the
# array without relying on negative indexing (which breaks for 0 outliers).
n_true_inliers = len(X_train)
# Outliers: uniform noise appended after the inliers.  Drawn from the seeded
# `rng` rather than the global np.random state so every run is reproducible.
X_train = np.r_[X_train, rng.uniform(low=-6, high=6, size=(n_outliers, 2))]

# Fit the model.
clf = IsolationForest(max_samples=n_samples, random_state=rng, contamination=outliers_fraction)
clf.fit(X_train)
# y_pred_train = clf.predict(X_train)
scores_pred = clf.decision_function(X_train)
# Score below which `outliers_fraction` of the training samples fall; used
# only to draw the boundary in the plot.
threshold = stats.scoreatpercentile(scores_pred, 100 * outliers_fraction)

# Evaluate the decision function on a grid to draw the score contours.
xx, yy = np.meshgrid(np.linspace(-7, 7, 50), np.linspace(-7, 7, 50))
Z = clf.decision_function(np.c_[xx.ravel(), yy.ravel()])
Z = Z.reshape(xx.shape)

plt.title("IsolationForest")
# plt.contourf(xx, yy, Z, cmap=plt.cm.Blues_r)
# Outlier region: scores from the minimum up to the threshold.
plt.contourf(xx, yy, Z, levels=np.linspace(Z.min(), threshold, 7), cmap=plt.cm.Blues_r)
# Boundary between the outlier region and the inlier region.
a = plt.contour(xx, yy, Z, levels=[threshold], linewidths=2, colors='red')
# Inlier region: scores from the threshold up to the maximum.
plt.contourf(xx, yy, Z, levels=[threshold, Z.max()], colors='palevioletred')

b = plt.scatter(X_train[:n_true_inliers, 0], X_train[:n_true_inliers, 1], c='white',
                s=20, edgecolor='k')
c = plt.scatter(X_train[n_true_inliers:, 0], X_train[n_true_inliers:, 1], c='black',
                s=20, edgecolor='k')
plt.axis('tight')
plt.xlim((-7, 7))
plt.ylim((-7, 7))
# legend_elements() yields a legend handle for the boundary contour without
# touching ContourSet.collections, which newer matplotlib releases deprecate.
boundary_handle = a.legend_elements()[0][0]
plt.legend([boundary_handle, b, c],
           ['learned decision function', 'true inliers', 'true outliers'],
           loc="upper left")
plt.show()
图片.png

author: usst2019zp_l@163.com

#########################IF_JD##########################
# Pipeline: load two Excel sheets, index the rows of each id as a time
# series, extract tsfresh features per id, score every id with an
# IsolationForest and write the result to an Excel file.
# NOTE(review): this section is labelled IF_JD but reads the *_NJ.xlsx inputs
# and writes NJ_IF_tsfresh.xlsx -- confirm the intended file names.
#################### Data preparation ####################
import pandas as pd
df = pd.read_excel('4_NJ.xlsx', sheet_name='Sheet1')
data = pd.read_excel('OP011_NJ.xlsx', sheet_name='Sheet1')
# DataFrame.append was removed in pandas 2.0; concat is the supported form.
# ignore_index avoids duplicate row labels from the two sheets.
df = pd.concat([df, data], ignore_index=True)
# Number the rows of each id 1..n in their original file order.  This is done
# before sorting so the within-id order cannot be disturbed by the sort.
df['time'] = df.groupby('id', sort=False).cumcount() + 1
# Category order = first appearance of each id (deterministic, unlike
# iterating over a set).  Assumes the id column is named 'id'.
class_l = list(pd.unique(df['id']))
df['id'] = df['id'].astype('category')
# reorder_categories no longer accepts inplace=True; assign the result.
df['id'] = df['id'].cat.reorder_categories(class_l)
# Stable sort keeps rows of the same id in their original relative order.
df.sort_values('id', kind='mergesort', inplace=True)
df.reset_index(drop=True, inplace=True)
################### Feature extraction ###################
from tsfresh import extract_features
extracted_features = extract_features(df, column_id="id", column_sort="time")
import numpy as np
# Explicit copy so that zero-filling below never writes through to
# extracted_features.
b = extracted_features.to_numpy(dtype=float, copy=True)
############ Isolation Forest anomaly detection ###########
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.ensemble import IsolationForest
# 1 - replace inf / nan feature values with 0
b[~np.isfinite(b)] = 0
# 2 - build and fit the isolation forest
model = IsolationForest(n_estimators=100, max_samples='auto', contamination=float(0.05), max_features=1.0)
model.fit(b)
# 3 - store the detection result (one row per id) in a new table
g = pd.DataFrame({
    'id': extracted_features.index.values,
    'scores': model.decision_function(b),
    'anomaly': model.predict(b),
})
g.to_excel('NJ_IF_tsfresh.xlsx')
#########################IF_NJ##########################
# Pipeline: load two Excel sheets, index the rows of each id as a time
# series, extract tsfresh features per id, score every id with an
# IsolationForest and write the result to an Excel file.
# NOTE(review): this section reads the same inputs and writes the same output
# file as the IF_JD section earlier in this file, so it overwrites that
# result -- confirm whether both sections are needed.
#################### Data preparation ####################
import pandas as pd
df = pd.read_excel('4_NJ.xlsx', sheet_name='Sheet1')
data = pd.read_excel('OP011_NJ.xlsx', sheet_name='Sheet1')
# DataFrame.append was removed in pandas 2.0; concat is the supported form.
# ignore_index avoids duplicate row labels from the two sheets.
df = pd.concat([df, data], ignore_index=True)
# Number the rows of each id 1..n in their original file order.  This is done
# before sorting so the within-id order cannot be disturbed by the sort.
df['time'] = df.groupby('id', sort=False).cumcount() + 1
# Category order = first appearance of each id (deterministic, unlike
# iterating over a set).  Assumes the id column is named 'id'.
class_l = list(pd.unique(df['id']))
df['id'] = df['id'].astype('category')
# reorder_categories no longer accepts inplace=True; assign the result.
df['id'] = df['id'].cat.reorder_categories(class_l)
# Stable sort keeps rows of the same id in their original relative order.
df.sort_values('id', kind='mergesort', inplace=True)
df.reset_index(drop=True, inplace=True)
################### Feature extraction ###################
from tsfresh import extract_features
extracted_features = extract_features(df, column_id="id", column_sort="time")
import numpy as np
# Explicit copy so that zero-filling below never writes through to
# extracted_features.
b = extracted_features.to_numpy(dtype=float, copy=True)
############ Isolation Forest anomaly detection ###########
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.ensemble import IsolationForest
# 1 - replace inf / nan feature values with 0
b[~np.isfinite(b)] = 0
# 2 - build and fit the isolation forest
model = IsolationForest(n_estimators=100, max_samples='auto', contamination=float(0.05), max_features=1.0)
model.fit(b)
# 3 - store the detection result (one row per id) in a new table
g = pd.DataFrame({
    'id': extracted_features.index.values,
    'scores': model.decision_function(b),
    'anomaly': model.predict(b),
})
g.to_excel('NJ_IF_tsfresh.xlsx')

相关文章

网友评论

      本文标题:[Python与数据分析]-10IF

      本文链接:https://www.haomeiwen.com/subject/sejrrktx.html