美文网首页
文本分类挖掘预测

文本分类挖掘预测

作者: jackmanzhang | 来源:发表于2019-01-04 18:41 被阅读0次

    首先说明:本文内容比较简单,仅供参考 (⊙o⊙)。

    文本预测数据(由于完整数据集太大,在此仅使用 1 万条数据进行测试,即 test 数据集)

    数据集地址下载:

    https://pan.baidu.com/share/init?surl=XIZwRlG4-yynR9fSEAdRiA
    密码:kxxa

    首先把需要测试的数据集读取并暂存下来,依次进行分词、关键词提取、求关键词集合的并集、转换为特征向量等操作,然后把关键词集合和特征向量保存到文本文件里;

    import jieba
    import jieba.analyse
    import numpy as np

    # Script 1 of 2: extract the top keywords of every article, build the
    # global keyword vocabulary, and write both the vocabulary and one binary
    # keyword-presence vector per article to text files for the second script.
    f_train='C:/Users/Administrator/PycharmProjects/new/练习/第六月/other/cnews/cnews.test.txt'
    # Output names match the files that the training script and the Django
    # view open ('...0.txt'); the previous '...00.txt' names were never read.
    vocab_path='特征集合变换列表0.txt'
    vectors_path='all0.txt'

    # Each input line is "<label>\t<article text>"; only the text is used here.
    list_x=[]
    with open(f_train,'r',encoding='utf-8') as file_train:
        for line in file_train:
            line_list=line.split('\t')
            list_x.append(line_list[1])

    # Replace every article by its (at most) 10 keywords from jieba's
    # keyword extractor.
    list_x=[
        jieba.analyse.extract_tags(article,topK=10,withWeight=False,allowPOS=())
        for article in list_x
    ]
    if list_x:
        print(list_x[0])

    # Vocabulary = union of the keywords of all articles. Sorting gives every
    # keyword a column position that does not depend on set iteration order.
    set_union=set()
    for keywords in list_x:
        set_union.update(keywords)
    print(len(set_union))
    list_set_union=sorted(set_union)
    print(list_set_union)
    # No explicit encoding on purpose: the scripts that read these files open
    # them with the platform default as well.
    with open(vocab_path,'w') as filelist:
        filelist.write(str(list_set_union))

    print('*'*100)
    # keyword -> column index, so each keyword is located in O(1) instead of
    # scanning the whole vocabulary.
    feature_index={word:k for k,word in enumerate(list_set_union)}
    with open(vectors_path,'w') as file:
        for count,keywords in enumerate(list_x,1):
            print('count:',count)
            # One 0/1 entry per vocabulary word; 1 marks a keyword of this article.
            list_one=[0]*len(list_set_union)
            for word in keywords:
                list_one[feature_index[word]]=1
            # One Python list literal per line; the reader parses it back.
            file.write(str(list_one)+'\n')
    

    然后在第二个 Python 文件里读取刚才保存的文件。如果把两步写在同一个文件里,每次运行都要重新生成特征向量,速度比较慢,所以这里拆成了两个文件,便于操作。

    
    import ast

    from sklearn.linear_model import LogisticRegression
    import jieba
    import jieba.analyse

    # Script 2 of 2: load the saved feature vectors and vocabulary, fit a
    # logistic-regression classifier on them, then classify text typed at the
    # prompt until the input stream ends.
    f_train='C:/Users/Administrator/PycharmProjects/new/练习/第六月/other/cnews/cnews.test.txt'

    # The class label is the first tab-separated field of every line.
    with open(f_train,'r',encoding='utf-8') as file_train:
        list_y=[line.split('\t')[0] for line in file_train]

    # Encode the labels as integers. Sorting makes the ids identical on every
    # run; dict_set_y maps an id back to its label for display.
    list_set_y=sorted(set(list_y))
    print(list_set_y)
    dict_set_y=dict(enumerate(list_set_y))
    label_to_id={label:k for k,label in dict_set_y.items()}
    list_y=[label_to_id[label] for label in list_y]

    # Every line of all0.txt is one Python list literal (a 0/1 feature vector).
    # literal_eval parses literals only, so the file cannot execute code.
    list_all=[]
    with open('all0.txt','r') as f:
        for line in f:
            line=line.strip()
            if line:
                list_all.append(ast.literal_eval(line))
    print(len(list_all))

    lr_model = LogisticRegression()
    lr_model.fit(list_all, list_y)

    # The vocabulary file holds a single list literal; the position of a
    # keyword in that list is its column in the feature vectors.
    with open('特征集合变换列表0.txt','r') as f:
        list_set_union=ast.literal_eval(f.read())
    feature_index={}
    for k,word in enumerate(list_set_union):
        # setdefault keeps the first position if a word were ever repeated.
        feature_index.setdefault(word,k)

    while True:
        try:
            cheshi=input('测试:')
        except (EOFError,KeyboardInterrupt):
            # End of input or Ctrl-C: leave the prompt loop without a traceback.
            break
        segment_i=jieba.analyse.extract_tags(cheshi,topK=10,withWeight=False,allowPOS=())

        # Same 0/1 encoding as the training vectors; keywords that are not in
        # the vocabulary have no column and are ignored.
        list_one=[0]*len(list_set_union)
        for word in segment_i:
            k=feature_index.get(word)
            if k is not None:
                list_one[k]=1
        s=lr_model.predict([list_one])
        print(dict_set_y[s[0]])
    

    直到这里基本可以完成简单预测,下面进行一个简单的前后端界面交互。利用django进行交互,简单说明一下建项目的流程。

    image.png
    image.png
    image.png

    注意:如果项目中建立了 static 目录,需要在 settings.py 里(一般加在文件末尾)添加如下配置;没有建立该目录则不需要添加。其他内容暂时不需要更改。

    # Project-level "static" directory, registered as an extra static-files location.
    static_root = os.path.join(BASE_DIR, 'static')
    STATICFILES_DIRS = [static_root]
    
    image.png
    image.png
    from django.conf.urls import url
    from . import views
    # Site root -> views.index; /serach/ -> views.serach.
    # NOTE: "serach" (sic) is the spelling used by the view name and by the
    # form's action attribute, so it must stay as it is.
    home_route = url(r'^$', views.index)
    search_route = url(r'^serach/$', views.serach)
    urlpatterns = [home_route, search_route]
    
    
    image.png
    from django.shortcuts import render
    from sklearn.linear_model import LogisticRegression
    from sklearn.utils.validation import check_array as check_arrays
    import jieba
    import time
    import jieba.analyse
    from django.shortcuts import render,HttpResponse,HttpResponseRedirect,redirect
    # Create your views here.
    def index(request):
        """Render the index.html template with no extra context."""
        template_name = 'index.html'
        return render(request, template_name)
    # Labelled corpus, plus the feature-vector and vocabulary files that this
    # view reads from disk.
    _TRAIN_FILE = 'C:/Users/Administrator/PycharmProjects/new/练习/第六月/other/cnews/cnews.test.txt'
    _VECTORS_FILE = 'C:/Users/Administrator/PycharmProjects/new/练习/第六月/other/all0.txt'
    _VOCAB_FILE = 'C:/Users/Administrator/PycharmProjects/new/练习/第六月/other/特征集合变换列表0.txt'

    # Filled on first use and then reused by later requests in this process, so
    # the files are parsed and the model is fitted once instead of per request
    # (and the large feature matrix is no longer copied into the user session).
    _CLASSIFIER_CACHE = {}


    def _load_classifier():
        """Return the cached classifier state, building it on first call.

        The returned dict has the keys ``model`` (fitted LogisticRegression),
        ``label_names`` (list; index = integer class id), ``feature_index``
        (keyword -> column) and ``n_features`` (vector length).

        Raises:
            OSError: if one of the three data files cannot be opened.
            ValueError / SyntaxError: if a data file does not contain valid
                Python list literals.
        """
        import ast  # local import: only this helper needs it

        if _CLASSIFIER_CACHE:
            return _CLASSIFIER_CACHE

        # The class label is the first tab-separated field of every line.
        with open(_TRAIN_FILE, 'r', encoding='utf-8') as file_train:
            raw_labels = [line.split('\t')[0] for line in file_train]
        label_names = sorted(set(raw_labels))
        label_ids = {name: k for k, name in enumerate(label_names)}
        list_y = [label_ids[name] for name in raw_labels]

        # One list literal (0/1 feature vector) per line. literal_eval parses
        # literals only, so the file content cannot execute code.
        with open(_VECTORS_FILE, 'r') as f:
            list_all = [ast.literal_eval(line) for line in map(str.strip, f) if line]

        # A single list literal; a keyword's position is its feature column.
        with open(_VOCAB_FILE, 'r') as f:
            list_set_union = ast.literal_eval(f.read())
        feature_index = {}
        for k, word in enumerate(list_set_union):
            # setdefault keeps the first position if a word were ever repeated.
            feature_index.setdefault(word, k)

        lr_model = LogisticRegression()
        lr_model.fit(list_all, list_y)

        # Publish everything in one step so a non-empty cache is always complete.
        _CLASSIFIER_CACHE.update(
            model=lr_model,
            label_names=label_names,
            feature_index=feature_index,
            n_features=len(list_set_union),
        )
        return _CLASSIFIER_CACHE


    def serach(request):
        """Classify the text posted as ``cheshi`` and render the result.

        Args:
            request: the incoming Django request; the text is read from
                ``request.POST['cheshi']``.

        Returns:
            The rendered ``index.html``. When text was submitted, the context
            contains ``content`` with the predicted label; when the field is
            missing or blank (e.g. a GET request), the template is rendered
            without a prediction instead of failing inside the keyword
            extractor.
        """
        cheshi = request.POST.get('cheshi')
        if not cheshi or not cheshi.strip():
            return render(request, 'index.html')

        classifier = _load_classifier()
        segment_i = jieba.analyse.extract_tags(cheshi, topK=10, withWeight=False, allowPOS=())

        # Same 0/1 encoding as the training vectors; keywords outside the
        # vocabulary have no column and are ignored.
        list_one = [0] * classifier['n_features']
        for word in segment_i:
            k = classifier['feature_index'].get(word)
            if k is not None:
                list_one[k] = 1

        s = classifier['model'].predict([list_one])
        answer = classifier['label_names'][int(s[0])]
        ctx = {
            'content': answer
        }
        return render(request, 'index.html', ctx)
    
    

    最后我们在模板templates的文件夹中编写简单前端程序。

    image.png
    <!DOCTYPE html>
    <html lang="en">
    <head>
        <meta charset="UTF-8">
        <title>Title</title>
        {# jQuery is loaded from the static directory; the inline script at the bottom of the page uses it. #}
        <script src="/static/js/jquery-1.12.4.min.js"></script>
    
    </head>
    <body>
    {#<img src="/static/img/1.jpg">#}
    <div style="text-align: center;margin-top: 100px">
    {# The form posts to /serach/; the csrf_token tag is needed when Django's CSRF middleware is enabled. #}
    <form action="/serach/" method="post" >
        {% csrf_token %}
        {# "cheshi" is the field name the serach view reads from request.POST. #}
        <textarea cols="50%" rows="10" name="cheshi" id="tt"></textarea><br/>
        <input type="submit" id="submit"><br>
        {# Displays the "content" context value (the predicted label) returned by the view. #}
        <input type="text" value="{{ content }}" name="over">
    
    </form>
        <script>
            // Block submission while the textarea is empty and tell the user why.
            $("#submit").click(function () {
                if($("#tt").val()==''){
                    alert('不能发空')
                    // Returning false from the click handler cancels the submit.
                    return false
                }
    
            })
    
        </script>
    </div>
    
    </body>
    </html>
    

    到这里前后端交互基本可以实现了。测试之前需要先执行数据库迁移(python manage.py migrate),否则 session 无法存储。

    image.png

    然后运行项目

    image.png

    运行之后在浏览器中输入 127.0.0.1:8000,不出意外应该出现如下情况。其他意外请自行百度解决,一般都是依赖包没装全,安装好就好了。或者emmm...(此处省略n个字,请自行脑补 (⊙o⊙))

    image.png image.png image.png

    整体过程基本结束。如有问题请互相讨论留言,本内容由编者独创,仅供参考,如有雷同纯属巧合。

    相关文章

      网友评论

          本文标题:文本分类挖掘预测

          本文链接:https://www.haomeiwen.com/subject/cxukrqtx.html