美文网首页
[Python与数据分析]-10IF

[Python与数据分析]-10IF

作者: 六千宛 | 来源:发表于2020-08-06 18:08 被阅读0次

    @官方

    #!/usr/bin/python
    # -*- coding:utf-8 -*-
     
    import numpy as np
    import matplotlib.pyplot as plt
    from sklearn.ensemble import IsolationForest
    from scipy import stats
     
    # Demo: fit an IsolationForest on synthetic 2-D data and plot the learned
    # inlier/outlier regions together with the ground-truth points.

    # One seeded generator drives both the data synthesis and the model so
    # that every run produces the same figure.
    rng = np.random.RandomState(42)

    # Build the training set.
    n_samples = 200  # total number of samples
    outliers_fraction = 0.25  # fraction of the samples that are outliers
    n_inliers = int((1. - outliers_fraction) * n_samples)
    n_outliers = int(outliers_fraction * n_samples)

    X = 0.3 * rng.randn(n_inliers // 2, 2)
    X_train = np.r_[X + 2, X - 2]  # inliers: the same cloud shifted by +2 and -2
    # Append the outliers after the inliers. They are drawn from `rng` (not
    # the global np.random state) so the seed above actually fixes them.
    X_train = np.r_[X_train, rng.uniform(low=-6, high=6, size=(n_outliers, 2))]

    # Fit the model.
    clf = IsolationForest(max_samples=n_samples, random_state=rng, contamination=outliers_fraction)
    clf.fit(X_train)
    scores_pred = clf.decision_function(X_train)
    # Score below which `outliers_fraction` of the training samples fall;
    # used only to split the plot into an outlier and an inlier region.
    threshold = stats.scoreatpercentile(scores_pred, 100 * outliers_fraction)

    # Evaluate the decision function on a grid covering the plotted area.
    xx, yy = np.meshgrid(np.linspace(-7, 7, 50), np.linspace(-7, 7, 50))
    Z = clf.decision_function(np.c_[xx.ravel(), yy.ravel()])
    Z = Z.reshape(xx.shape)

    plt.title("IsolationForest")
    # Outlier region: scores from the grid minimum up to the threshold.
    plt.contourf(xx, yy, Z, levels=np.linspace(Z.min(), threshold, 7), cmap=plt.cm.Blues_r)
    # Boundary between the outlier and inlier regions.
    a = plt.contour(xx, yy, Z, levels=[threshold], linewidths=2, colors='red')
    # Inlier region: scores from the threshold up to the grid maximum.
    plt.contourf(xx, yy, Z, levels=[threshold, Z.max()], colors='palevioletred')

    # The last n_outliers rows of X_train are the outliers appended above.
    b = plt.scatter(X_train[:-n_outliers, 0], X_train[:-n_outliers, 1], c='white',
                    s=20, edgecolor='k')
    c = plt.scatter(X_train[-n_outliers:, 0], X_train[-n_outliers:, 1], c='black',
                    s=20, edgecolor='k')
    plt.axis('tight')
    plt.xlim((-7, 7))
    plt.ylim((-7, 7))
    # ContourSet.collections is deprecated in recent Matplotlib releases;
    # legend_elements() returns a legend handle for the boundary line instead.
    boundary_handle = a.legend_elements()[0][0]
    plt.legend([boundary_handle, b, c],
               ['learned decision function', 'true inliers', 'true outliers'],
               loc="upper left")
    plt.show()
    
    图片.png

    author:usst2019zp_l@163.com

    #########################IF_JD##########################
    # Pipeline: load two workbooks, add a per-id time index, extract tsfresh
    # features per id, score them with an IsolationForest and write the
    # scores to an Excel file.
    # NOTE(review): this section is labelled IF_JD but reads and writes the
    # same NJ workbooks as the IF_NJ section below - confirm the file names.
    import matplotlib.pyplot as plt
    import numpy as np
    import pandas as pd
    import seaborn as sns
    from sklearn.ensemble import IsolationForest
    from tsfresh import extract_features

    ###################### Data preparation ######################
    df = pd.read_excel('4_NJ.xlsx', sheet_name='Sheet1')
    data = pd.read_excel('OP011_NJ.xlsx', sheet_name='Sheet1')
    # DataFrame.append was removed in pandas 2.0; concat is the replacement.
    df = pd.concat([df, data])
    r_list = df.keys()[0]
    clo_r = df[r_list]  # id column (assumed to be the first column)
    # Distinct ids in order of first appearance (set() gave an arbitrary order).
    class_l = clo_r.unique().tolist()
    # Make 'id' categorical so that sorting follows the order in class_l.
    df['id'] = df['id'].astype('category')
    # The inplace= argument of reorder_categories was removed in pandas 2.0.
    df['id'] = df['id'].cat.reorder_categories(class_l)
    # A stable sort keeps the original row order inside each id, which the
    # time index below relies on (the default quicksort does not guarantee it).
    df.sort_values('id', inplace=True, kind='mergesort')
    # Renumber the index after sorting.
    df.reset_index(drop=True, inplace=True)
    # Running counter 1..n within each id, used as the time axis.
    df['time'] = df.groupby('id', observed=True).cumcount() + 1

    ##################### Feature extraction #####################
    extracted_features = extract_features(df, column_id="id", column_sort="time")
    # np.array copies, so zeroing values below leaves extracted_features intact.
    b = np.array(extracted_features.values, dtype=float)

    ############ Isolation forest anomaly detection ##############
    # 1 - replace inf / nan feature values with 0
    b[~np.isfinite(b)] = 0
    # 2 - build the isolation forest; random_state makes reruns reproducible
    model = IsolationForest(n_estimators=100, max_samples='auto', contamination=float(0.05),
                            max_features=1.0, random_state=42)
    model.fit(b)
    # 3 - store the detection results in a new table, one row per id
    order = ['id', 'scores', 'anomaly']
    g = pd.DataFrame(
        {
            'id': extracted_features.index.values,
            'scores': model.decision_function(b),
            'anomaly': model.predict(b),
        },
        columns=order,
    )
    g.to_excel('NJ_IF_tsfresh.xlsx')
    #########################IF_NJ##########################
    # Pipeline: load two workbooks, add a per-id time index, extract tsfresh
    # features per id, score them with an IsolationForest and write the
    # scores to an Excel file.
    # NOTE(review): the IF_JD section above reads the same inputs and writes
    # the same output file, so this run overwrites its result - confirm.
    import matplotlib.pyplot as plt
    import numpy as np
    import pandas as pd
    import seaborn as sns
    from sklearn.ensemble import IsolationForest
    from tsfresh import extract_features

    ###################### Data preparation ######################
    df = pd.read_excel('4_NJ.xlsx', sheet_name='Sheet1')
    data = pd.read_excel('OP011_NJ.xlsx', sheet_name='Sheet1')
    # DataFrame.append was removed in pandas 2.0; concat is the replacement.
    df = pd.concat([df, data])
    r_list = df.keys()[0]
    clo_r = df[r_list]  # id column (assumed to be the first column)
    # Distinct ids in order of first appearance (set() gave an arbitrary order).
    class_l = clo_r.unique().tolist()
    # Make 'id' categorical so that sorting follows the order in class_l.
    df['id'] = df['id'].astype('category')
    # The inplace= argument of reorder_categories was removed in pandas 2.0.
    df['id'] = df['id'].cat.reorder_categories(class_l)
    # A stable sort keeps the original row order inside each id, which the
    # time index below relies on (the default quicksort does not guarantee it).
    df.sort_values('id', inplace=True, kind='mergesort')
    # Renumber the index after sorting.
    df.reset_index(drop=True, inplace=True)
    # Running counter 1..n within each id, used as the time axis.
    df['time'] = df.groupby('id', observed=True).cumcount() + 1

    ##################### Feature extraction #####################
    extracted_features = extract_features(df, column_id="id", column_sort="time")
    # np.array copies, so zeroing values below leaves extracted_features intact.
    b = np.array(extracted_features.values, dtype=float)

    ############ Isolation forest anomaly detection ##############
    # 1 - replace inf / nan feature values with 0
    b[~np.isfinite(b)] = 0
    # 2 - build the isolation forest; random_state makes reruns reproducible
    model = IsolationForest(n_estimators=100, max_samples='auto', contamination=float(0.05),
                            max_features=1.0, random_state=42)
    model.fit(b)
    # 3 - store the detection results in a new table, one row per id
    order = ['id', 'scores', 'anomaly']
    g = pd.DataFrame(
        {
            'id': extracted_features.index.values,
            'scores': model.decision_function(b),
            'anomaly': model.predict(b),
        },
        columns=order,
    )
    g.to_excel('NJ_IF_tsfresh.xlsx')
    

    相关文章

      网友评论

          本文标题:[Python与数据分析]-10IF

          本文链接:https://www.haomeiwen.com/subject/sejrrktx.html