美文网首页
[Python与数据分析]-13sklearn异常检测

[Python与数据分析]-13sklearn异常检测

作者: 六千宛 | 来源:发表于2020-08-03 17:41 被阅读0次

IF

######################### IF_JD ##########################
#################### Data preparation ####################
import pandas as pd

# Read the two workbooks and stack their rows into one long table.
# pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
df = pd.read_excel('4_JD.xlsx', sheet_name='Sheet1')
data = pd.read_excel('OP011_JD.xlsx', sheet_name='Sheet1')
df = pd.concat([df, data])

# Distinct values of the first column, in order of first appearance
# (dict.fromkeys is deterministic, unlike iterating a set).
# NOTE(review): everything below addresses the column as 'id', so the first
# column of the workbooks is assumed to be 'id' -- confirm.
r_list = df.keys()[0]
clo_r = df[r_list]
class_l = list(dict.fromkeys(clo_r))

# Make 'id' categorical with class_l as the category order so that sorting
# groups the rows series by series. The result is assigned back because the
# inplace= argument of reorder_categories was removed in pandas 2.0.
df['id'] = df['id'].astype('category')
df['id'] = df['id'].cat.reorder_categories(class_l)

# A stable sort keeps the original row order inside every id; the default
# quicksort may shuffle it, which would scramble the time axis built below.
df.sort_values('id', kind='stable', inplace=True)
df.reset_index(drop=True, inplace=True)

# 1-based running position of each row within its own series.
df['time'] = df.groupby('id', observed=True).cumcount() + 1
################### Feature extraction ###################
from tsfresh import extract_features
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.ensemble import IsolationForest

# Let tsfresh compute features per 'id', ordering samples by 'time'.
extracted_features = extract_features(df, column_id="id", column_sort="time")
a = extracted_features.values.tolist()
b = np.asarray(a)

############ Isolation Forest anomaly detection ############
# 1 - overwrite every inf / nan entry of the feature matrix with 0
b[~np.isfinite(b)] = 0

# 2 - build and fit the isolation forest
model = IsolationForest(
    n_estimators=100,
    max_samples='auto',
    contamination=0.1,
    max_features=1.0,
)
model.fit(b)

# 3 - collect id / score / label into a table and write it to Excel
order = ['id', 'scores', 'anomaly']
g = pd.DataFrame(
    {
        'id': extracted_features.index.values,
        'scores': model.decision_function(b),
        'anomaly': model.predict(b),
    },
    columns=order,
)
g.to_excel('JD_IF_tsfresh.xlsx')
######################### IF_NJ ##########################
#################### Data preparation ####################
import pandas as pd

# Read the two workbooks and stack their rows into one long table.
# pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
df = pd.read_excel('4_NJ.xlsx', sheet_name='Sheet1')
data = pd.read_excel('OP011_NJ.xlsx', sheet_name='Sheet1')
df = pd.concat([df, data])

# Distinct values of the first column, in order of first appearance
# (dict.fromkeys is deterministic, unlike iterating a set).
# NOTE(review): everything below addresses the column as 'id', so the first
# column of the workbooks is assumed to be 'id' -- confirm.
r_list = df.keys()[0]
clo_r = df[r_list]
class_l = list(dict.fromkeys(clo_r))

# Make 'id' categorical with class_l as the category order so that sorting
# groups the rows series by series. The result is assigned back because the
# inplace= argument of reorder_categories was removed in pandas 2.0.
df['id'] = df['id'].astype('category')
df['id'] = df['id'].cat.reorder_categories(class_l)

# A stable sort keeps the original row order inside every id; the default
# quicksort may shuffle it, which would scramble the time axis built below.
df.sort_values('id', kind='stable', inplace=True)
df.reset_index(drop=True, inplace=True)

# 1-based running position of each row within its own series.
df['time'] = df.groupby('id', observed=True).cumcount() + 1
################### Feature extraction ###################
from tsfresh import extract_features
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.ensemble import IsolationForest

# Let tsfresh compute features per 'id', ordering samples by 'time'.
extracted_features = extract_features(df, column_id="id", column_sort="time")
a = extracted_features.values.tolist()
b = np.asarray(a)

############ Isolation Forest anomaly detection ############
# 1 - overwrite every inf / nan entry of the feature matrix with 0
b[~np.isfinite(b)] = 0

# 2 - build and fit the isolation forest
model = IsolationForest(
    n_estimators=100,
    max_samples='auto',
    contamination=0.05,
    max_features=1.0,
)
model.fit(b)

# 3 - collect id / score / label into a table and write it to Excel
order = ['id', 'scores', 'anomaly']
g = pd.DataFrame(
    {
        'id': extracted_features.index.values,
        'scores': model.decision_function(b),
        'anomaly': model.predict(b),
    },
    columns=order,
)
g.to_excel('NJ_IF_tsfresh.xlsx')

LOF

######################### LOF_JD #########################
#################### Data preparation ####################
import pandas as pd

# Read the two workbooks and stack their rows into one long table.
# pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
df = pd.read_excel('4_JD.xlsx', sheet_name='Sheet1')
data = pd.read_excel('OP011_JD.xlsx', sheet_name='Sheet1')
df = pd.concat([df, data])

# Distinct values of the first column, in order of first appearance
# (dict.fromkeys is deterministic, unlike iterating a set).
# NOTE(review): everything below addresses the column as 'id', so the first
# column of the workbooks is assumed to be 'id' -- confirm.
r_list = df.keys()[0]
clo_r = df[r_list]
class_l = list(dict.fromkeys(clo_r))

# Make 'id' categorical with class_l as the category order so that sorting
# groups the rows series by series. The result is assigned back because the
# inplace= argument of reorder_categories was removed in pandas 2.0.
df['id'] = df['id'].astype('category')
df['id'] = df['id'].cat.reorder_categories(class_l)

# A stable sort keeps the original row order inside every id; the default
# quicksort may shuffle it, which would scramble the time axis built below.
df.sort_values('id', kind='stable', inplace=True)
df.reset_index(drop=True, inplace=True)

# 1-based running position of each row within its own series.
df['time'] = df.groupby('id', observed=True).cumcount() + 1
################### Feature extraction ###################
from tsfresh import extract_features
import numpy as np
import matplotlib.pyplot as plt
from sklearn.neighbors import LocalOutlierFactor
from scipy import stats

# Let tsfresh compute features per 'id', ordering samples by 'time'.
extracted_features = extract_features(df, column_id="id", column_sort="time")
a = extracted_features.values.tolist()
b = np.asarray(a)

################# LOF anomaly detection #################
# 1 - overwrite every inf / nan entry of the feature matrix with 0
b[~np.isfinite(b)] = 0

# 2 - fit the Local Outlier Factor model and label every sample
model = LocalOutlierFactor(n_neighbors=2518, contamination=0.1)
y_pred = model.fit_predict(b)
scores_pred = model.negative_outlier_factor_

# 3 - collect id / score / label into a table and write it to Excel
order = ['id', 'scores', 'anomaly']
g = pd.DataFrame(
    {
        'id': extracted_features.index.values,
        'scores': scores_pred,
        'anomaly': y_pred,
    },
    columns=order,
)
g.to_excel('JD_LOF_tsfresh.xlsx')
######################### LOF_NJ #########################
#################### Data preparation ####################
import pandas as pd

# Read the two workbooks and stack their rows into one long table.
# pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
df = pd.read_excel('4_NJ.xlsx', sheet_name='Sheet1')
data = pd.read_excel('OP011_NJ.xlsx', sheet_name='Sheet1')
df = pd.concat([df, data])

# Distinct values of the first column, in order of first appearance
# (dict.fromkeys is deterministic, unlike iterating a set).
# NOTE(review): everything below addresses the column as 'id', so the first
# column of the workbooks is assumed to be 'id' -- confirm.
r_list = df.keys()[0]
clo_r = df[r_list]
class_l = list(dict.fromkeys(clo_r))

# Make 'id' categorical with class_l as the category order so that sorting
# groups the rows series by series. The result is assigned back because the
# inplace= argument of reorder_categories was removed in pandas 2.0.
df['id'] = df['id'].astype('category')
df['id'] = df['id'].cat.reorder_categories(class_l)

# A stable sort keeps the original row order inside every id; the default
# quicksort may shuffle it, which would scramble the time axis built below.
df.sort_values('id', kind='stable', inplace=True)
df.reset_index(drop=True, inplace=True)

# 1-based running position of each row within its own series.
df['time'] = df.groupby('id', observed=True).cumcount() + 1
################### Feature extraction ###################
from tsfresh import extract_features
import numpy as np
import matplotlib.pyplot as plt
from sklearn.neighbors import LocalOutlierFactor
from scipy import stats

# Let tsfresh compute features per 'id', ordering samples by 'time'.
extracted_features = extract_features(df, column_id="id", column_sort="time")
a = extracted_features.values.tolist()
b = np.asarray(a)

################# LOF anomaly detection #################
# 1 - overwrite every inf / nan entry of the feature matrix with 0
b[~np.isfinite(b)] = 0

# 2 - fit the Local Outlier Factor model and label every sample
model = LocalOutlierFactor(n_neighbors=2518, contamination=0.1)
y_pred = model.fit_predict(b)
scores_pred = model.negative_outlier_factor_

# 3 - collect id / score / label into a table and write it to Excel
order = ['id', 'scores', 'anomaly']
g = pd.DataFrame(
    {
        'id': extracted_features.index.values,
        'scores': scores_pred,
        'anomaly': y_pred,
    },
    columns=order,
)
g.to_excel('NJ_LOF_tsfresh.xlsx')

OneClassSVM

##################### OneClassSVM_JD #####################
#################### Data preparation ####################
import pandas as pd

# Read the two workbooks and stack their rows into one long table.
# pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
df = pd.read_excel('4_JD.xlsx', sheet_name='Sheet1')
data = pd.read_excel('OP011_JD.xlsx', sheet_name='Sheet1')
df = pd.concat([df, data])

# Distinct values of the first column, in order of first appearance
# (dict.fromkeys is deterministic, unlike iterating a set).
# NOTE(review): everything below addresses the column as 'id', so the first
# column of the workbooks is assumed to be 'id' -- confirm.
r_list = df.keys()[0]
clo_r = df[r_list]
class_l = list(dict.fromkeys(clo_r))

# Make 'id' categorical with class_l as the category order so that sorting
# groups the rows series by series. The result is assigned back because the
# inplace= argument of reorder_categories was removed in pandas 2.0.
df['id'] = df['id'].astype('category')
df['id'] = df['id'].cat.reorder_categories(class_l)

# A stable sort keeps the original row order inside every id; the default
# quicksort may shuffle it, which would scramble the time axis built below.
df.sort_values('id', kind='stable', inplace=True)
df.reset_index(drop=True, inplace=True)

# 1-based running position of each row within its own series.
df['time'] = df.groupby('id', observed=True).cumcount() + 1
################### Feature extraction ###################
from tsfresh import extract_features
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.font_manager
from sklearn import svm

# Let tsfresh compute features per 'id', ordering samples by 'time'.
extracted_features = extract_features(df, column_id="id", column_sort="time")
a = extracted_features.values.tolist()
b = np.array(a)

############## OneClassSVM anomaly detection ##############
# Overwrite every inf / nan entry of the feature matrix with 0.
b[np.isinf(b)] = 0
b[np.isnan(b)] = 0

# Fit on the feature matrix itself. The original called fit(X_train), but
# no X_train is ever defined in this script, so it raised NameError; the
# predictions and scores below are all computed on b.
model = svm.OneClassSVM(nu=0.1, kernel="rbf", gamma=0.1)
model.fit(b)
y_pred_train = model.predict(b)
# Number of samples labelled -1 by predict.
n_error_train = int(np.count_nonzero(y_pred_train == -1))

# Collect id / score / label into a table and write it to Excel.
g = pd.DataFrame()
g['scores'] = model.decision_function(b)
g['anomaly'] = y_pred_train
g['id'] = extracted_features.index.values
order = ['id', 'scores', 'anomaly']
g = g[order]
g.to_excel('JD_OneClassSVM_tsfresh.xlsx')
##################### OneClassSVM_NJ #####################
#################### Data preparation ####################
import pandas as pd

# Read the two workbooks and stack their rows into one long table.
# pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
df = pd.read_excel('4_NJ.xlsx', sheet_name='Sheet1')
data = pd.read_excel('OP011_NJ.xlsx', sheet_name='Sheet1')
df = pd.concat([df, data])

# Distinct values of the first column, in order of first appearance
# (dict.fromkeys is deterministic, unlike iterating a set).
# NOTE(review): everything below addresses the column as 'id', so the first
# column of the workbooks is assumed to be 'id' -- confirm.
r_list = df.keys()[0]
clo_r = df[r_list]
class_l = list(dict.fromkeys(clo_r))

# Make 'id' categorical with class_l as the category order so that sorting
# groups the rows series by series. The result is assigned back because the
# inplace= argument of reorder_categories was removed in pandas 2.0.
df['id'] = df['id'].astype('category')
df['id'] = df['id'].cat.reorder_categories(class_l)

# A stable sort keeps the original row order inside every id; the default
# quicksort may shuffle it, which would scramble the time axis built below.
df.sort_values('id', kind='stable', inplace=True)
df.reset_index(drop=True, inplace=True)

# 1-based running position of each row within its own series.
df['time'] = df.groupby('id', observed=True).cumcount() + 1
################### Feature extraction ###################
from tsfresh import extract_features
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.font_manager
from sklearn import svm

# Let tsfresh compute features per 'id', ordering samples by 'time'.
extracted_features = extract_features(df, column_id="id", column_sort="time")
a = extracted_features.values.tolist()
b = np.array(a)

############## OneClassSVM anomaly detection ##############
# Overwrite every inf / nan entry of the feature matrix with 0.
b[np.isinf(b)] = 0
b[np.isnan(b)] = 0

# Fit on the feature matrix itself. The original called fit(X_train), but
# no X_train is ever defined in this script, so it raised NameError; the
# predictions and scores below are all computed on b.
model = svm.OneClassSVM(nu=0.1, kernel="rbf", gamma=0.1)
model.fit(b)
y_pred_train = model.predict(b)
# Number of samples labelled -1 by predict.
n_error_train = int(np.count_nonzero(y_pred_train == -1))

# Collect id / score / label into a table and write it to Excel.
g = pd.DataFrame()
g['scores'] = model.decision_function(b)
g['anomaly'] = y_pred_train
g['id'] = extracted_features.index.values
order = ['id', 'scores', 'anomaly']
g = g[order]
g.to_excel('NJ_OneClassSVM_tsfresh.xlsx')

相关文章

  • [Python与数据分析]-13sklearn异常检测

    IF LOF OneClassSVM

  • (十一) 异常检测分析

    此部分内容来自对《Python数据分析与数据化运营》4.5节 异常检测 的读书笔记。数据中的异常数据通常被认为是异...

  • (十二)时间序列分析

    此部分内容来自对《Python数据分析与数据化运营》4.6节 异常检测 的读书笔记。时间序列是用来研究数据随时间变...

  • 异常检测算法速览(Python代码)

    一、异常检测简介 异常检测是通过数据挖掘方法发现与数据集分布不一致的异常数据,也被称为离群点、异常值检测等等。 1...

  • 风险信用评分卡相关文章总结

    基于Python的信用评分卡建模分析 【评分卡】评分卡入门与创建原则——分箱、WOE、IV、分值分配 异常值检测 ...

  • 机器学习中的两个问题

    最近在使用sklearn的OneClassSVM做异常检测的应用。在从最开始提取数据、分析数据、清洗数据、特征提...

  • Python学习(八)

    异常处理 Python中的异常类型总结: Python内置异常类的层次结构: 异常检测 try-except语句 ...

  • 第三章:数据探索

    3.1餐饮销售额数据异常值检测 2数据特征分析 分布分析:分布分析能揭示数据的分布特征和分布类型;定量数据的分布分...

  • 【算法】异常检测

    异常检测 异常检测(Anomaly Detection):异常检测就是从数据集中检测出异常样本,是一种无监督学习。...

  • python电子书汇总

    Python金融大数据分析_雅格书林 Python数据分析与挖掘实战 Python核心编程(第3版) Python...

网友评论

      本文标题:[Python与数据分析]-13sklearn异常检测

      本文链接:https://www.haomeiwen.com/subject/tzorrktx.html