Tidied Up a Little Crawler Script

Author: dec0de丶bcd385丶 | Published 2018-09-18 18:35
    #!/usr/bin/env python
    # -*- coding: utf-8 -*-
    """
    Created on Sat Nov  5 10:50:13 2016
    @author: hello
    """
    
    import re
    import codecs
    import requests
    # The site-specific names, URLs, and SQL fragments live in a separate
    # private module (shuju) so this file can be published on its own.
    from shuju import all_list
    from shuju import url_tou
    from shuju import url_dict
    from shuju import tou
    from shuju import wei
    
    
    
    def read_to_list(readonce=None):
        # The page embeds a JS array of objects; patch it so every element
        # is a valid Python dict literal before it is evaluated.
        readonce = re.sub(r'\{\}', '0', readonce)  # eval() needs a value, not an empty {}
        readonce = re.sub(r'null', '0', readonce)  # eval() would treat null as an undefined name
        findok = re.search(r'(?<=\[).*(?=\])', readonce)  # [ and ] must be escaped in the pattern
        pattern = re.compile(r'{.*?}')
        listlist = pattern.findall(findok.group())
        aa = [eval(x) for x in listlist]
        # aa = sorted(aa, key=lambda k: k['userName'][:1])  # cannot sort by Chinese names this way
        print([x['userName'] for x in aa])
        return aa
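
    # Aside (a sketch, not used by the script below): ast.literal_eval accepts
    # the same dict literals but refuses anything executable, so a hostile
    # page cannot run code the way it could through eval() above.
    def read_to_list_safe(readonce):
        import ast
        readonce = re.sub(r'\{\}', '0', readonce)
        readonce = re.sub(r'null', '0', readonce)
        body = re.search(r'(?<=\[).*(?=\])', readonce)
        return [ast.literal_eval(x) for x in re.findall(r'{.*?}', body.group())]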
    
    def pr_url(url_str=None):
        # Prepend the (private) base URL to a relative path.
        return url_tou + url_str
    
    
    def main():
        my_headers = {
            'User-Agent' : 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/48.0.2564.116 Safari/537.36',
            'Accept' : 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
            'Accept-Encoding' : 'gzip',
            'Accept-Language' : 'zh-CN,zh;q=0.8,en;q=0.6,zh-TW;q=0.4'
        }
        sss = requests.Session()
        # First GET; the Session keeps any cookies it sets for the POSTs below.
        r = sss.get(pr_url(url_dict['yy']), headers=my_headers)
    
    
        # Students ('学员'): the user data sits between two JS variable
        # declarations in the page, so slice it out and parse it.
        r = sss.post(pr_url(url_dict['xy']), headers=my_headers)
        a_str = r.text
        xa_str = a_str[a_str.find('var userGridData='):a_str.find('var userGridGridLoadUrl')]
        list_O = read_to_list(xa_str)
        xueyuan_list = [['学员', x['userName'] + '_' + x['subjectName'],
                         pr_url(url_dict['dl']) + x['loginName'] + '&password=' + x['password']]
                        for x in list_O]
        print(len(xueyuan_list))
    
    
        # Teachers ('老师')
        r = sss.post(pr_url(url_dict['ls']), headers=my_headers)
        a_str = r.text
        xa_str = a_str[a_str.find('var userGridData='):a_str.find('var userGridGridLoadUrl')]
        list_O = read_to_list(xa_str)
        daijiao_list = [['老师', x['userName'] + '_' + x['departmentName'],
                         pr_url(url_dict['dl']) + x['loginName'] + '&password=' + x['password']]
                        for x in list_O]
        print(len(daijiao_list))
    
    
        # Academic secretaries ('教秘')
        r = sss.post(pr_url(url_dict['jm']), headers=my_headers)
        a_str = r.text
        xa_str = a_str[a_str.find('var userGridData='):a_str.find('var userGridGridLoadUrl')]
        list_O = read_to_list(xa_str)
        jiaomi_list = [['教秘', x['userName'] + '_' + x['departmentName'],
                        pr_url(url_dict['dl']) + x['loginName'] + '&password=' + x['password']]
                       for x in list_O]
        print(len(jiaomi_list))
    
    
        # Directors ('主任')
        r = sss.post(pr_url(url_dict['zr']), headers=my_headers)
        a_str = r.text
        xa_str = a_str[a_str.find('var userGridData='):a_str.find('var userGridGridLoadUrl')]
        list_O = read_to_list(xa_str)
        zhuren_list = [['主任', x['userName'] + '_' + x['departmentName'],
                        pr_url(url_dict['dl']) + x['loginName'] + '&password=' + x['password']]
                       for x in list_O]
        print(len(zhuren_list))

        all_list.extend(xueyuan_list)
        all_list.extend(daijiao_list)
        all_list.extend(jiaomi_list)
        all_list.extend(zhuren_list)
        # Build one SQL value tuple per row and join them with '),('.
        zhongjian = [str(index + 1) + ",'" + x[0] + "','" + x[1] + "','" + x[2] + "'"
                     for index, x in enumerate(all_list)]
        zhongzhi = '),('.join(zhongjian)

        # tou and wei (from shuju) are the head and tail of the SQL statement.
        with codecs.open('zyys.sql', 'w', 'utf-8') as f2:
            f2.write(tou + zhongzhi + wei)

        print('完成')  # done
    
    
    if __name__ == '__main__':
        main()
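
The four role blocks in main() differ only in the label, the url_dict key, and which field gets joined onto the user name, so one table could drive all of them. A sketch reusing the names above (not how the script is currently written):

    # (role label, url_dict key, field appended to the user name)
    ROLES = [
        ('学员', 'xy', 'subjectName'),
        ('老师', 'ls', 'departmentName'),
        ('教秘', 'jm', 'departmentName'),
        ('主任', 'zr', 'departmentName'),
    ]

    def collect_all(sss, my_headers):
        for label, key, field in ROLES:
            r = sss.post(pr_url(url_dict[key]), headers=my_headers)
            a_str = r.text
            xa_str = a_str[a_str.find('var userGridData='):a_str.find('var userGridGridLoadUrl')]
            for x in read_to_list(xa_str):
                all_list.append([label,
                                 x['userName'] + '_' + x[field],
                                 pr_url(url_dict['dl']) + x['loginName'] + '&password=' + x['password']])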
    
    

The point of today's update is, first, to hide the program's identifying details so that I can upload it to GitHub. That makes version control and updates easy, and nobody can guess what the program is for.
Second, to get more familiar with import and with pairing from and import: from a module, import specific pieces, or import everything. Importing names one by one is more tedious, but it keeps the whole program readable. If instead you pull in everything at once, the editor's error checking starts falsely flagging directly referenced function and variable names while you code; faced with a screen full of exclamation marks, you cannot tell a real error from one caused by the import, and reading the program becomes an eyesore and a headache.
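
A minimal illustration of the difference, using this script's own modules:

    from shuju import url_dict    # one explicit name; its origin is obvious
    import requests               # whole module; used as requests.Session()
    # 'from shuju import *' would dump every name into this file, and the
    # editor's checker could no longer tell a typo from a star-imported name.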
Third, splitting the program into two parts feels a bit like a lock and key. From now on I should keep the data part and the program part separate; sometimes there is content I want to hide, so I wanted to try this approach. And I get to take advantage of GitHub's free hosting while I'm at it. Happy days.
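
Presumably the private shuju module looks roughly like this (a purely hypothetical sketch; the real values are exactly what the split is meant to hide):

    # shuju.py -- hypothetical layout; every value here is a placeholder
    all_list = []                         # collects [role, name, login-URL] rows
    url_tou  = 'http://example.invalid'   # base URL that pr_url() prepends
    url_dict = {'yy': '...', 'xy': '...', 'ls': '...',
                'jm': '...', 'zr': '...', 'dl': '...'}
    tou = "INSERT INTO ... VALUES ("      # head of the generated SQL statement
    wei = ");"                            # tail of the generated SQL statement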
