Python爬虫(Python3.6)

作者: Ucan先生 | 来源:发表于2018-05-05 17:57 被阅读0次

python3.6安装scrapy框架
Python爬虫(Python3.6)
51job大数据职位爬虫示例
爬虫学习(一)：利用requests爬取猫眼电影top100
python 中安装lxml包出现的问题
音视频环境安装
Python学习资料
CentOS 6.3编译安装Python3.6.3
Ubuntu python 虚拟环境 pycharm各种配置实录
Mac QGis 安装使用

import urllib.request
import urllib.error
import os
import re
import imageio
# Crawl configuration.  The original module-level names are kept so anything
# that relies on them keeps working (note that ``dir`` shadows the builtin).
capterId = 5301  # first chapter id to fetch
sectionId = 1  # first section number tried inside every chapter
dir = 'C:/Users/zybang/Desktop/gaoshu'
url = "http://netedu.xauat.edu.cn/jpkc/netedu/jpkc/gdsx/homepage/5jxsd/51/513/"
# Relative .gif paths (containing at least one "/") referenced by <img> tags.
# The match is confined to a single tag and a single quoted attribute value,
# and the dot before "gif" is literal.
pattern = re.compile(r'<img[^>]*?src="([^"]*?/[^"]*?\.gif)"')

LAST_CHAPTER_ID = 5312  # the original loop ran while capterId < 5313
LAST_SECTION_ID = 19  # the original loop ran while sectionId < 20
PAGE_ENCODING = 'gbk'
REQUEST_TIMEOUT = 30  # seconds; keeps a stalled connection from hanging the crawl
TITLE_PATTERN = re.compile('<title>(.*?)</title>')
# Characters that cannot appear in a Windows file name, plus control characters.
UNSAFE_FILENAME_CHARS = re.compile(r'[\\/:*?"<>|\x00-\x1f]')


def chapter_url(base_url, chapter_id):
    """Return the directory URL of one chapter: ``base_url`` + id + ``/``.

    The URL is always derived from the unchanged base, so chapters do not
    pile up on top of each other.
    """
    return base_url + str(chapter_id) + "/"


def section_page_name(chapter_id, section_id):
    """Return the page name (no extension): chapter id + 2-digit section."""
    return "{}{:02d}".format(chapter_id, section_id)


def extract_image_paths(html):
    """Return the distinct image paths matched by ``pattern``, in page order."""
    seen = set()
    paths = []
    for path in pattern.findall(html):
        if path not in seen:
            seen.add(path)
            paths.append(path)
    return paths


def extract_title(html, default):
    """Return a file-name-safe page title, or ``default`` if none is usable."""
    match = TITLE_PATTERN.search(html)
    if match is None:
        return default
    title = UNSAFE_FILENAME_CHARS.sub("_", match.group(1).strip())
    return title or default


def fetch(request_url):
    """Download ``request_url`` and return its body as bytes.

    On failure the URL and the HTTP status code (or the error reason) are
    printed and ``None`` is returned, so one bad URL does not stop the crawl.
    """
    try:
        with urllib.request.urlopen(request_url, timeout=REQUEST_TIMEOUT) as response:
            return response.read()
    except urllib.error.HTTPError as e:
        print(request_url)
        print(e.code)
    except OSError as e:
        # URLError is an OSError; this also covers timeouts and resets.
        print(request_url)
        print(getattr(e, "reason", e))
    return None


def save_bytes(path, payload):
    """Write ``payload`` to ``path``, creating every missing parent directory."""
    parent = os.path.dirname(path)
    if parent:
        os.makedirs(parent, exist_ok=True)
    with open(path, "wb") as out:
        out.write(payload)


def download_section(base_url, chapter_id, section_id, out_dir):
    """Save one section page and the .gif images it references.

    The page is stored as ``<out_dir>/<title>.htm`` (raw bytes as served) and
    each image as ``<out_dir>/<relative image path>``.  Returns ``True`` when
    the page itself was downloaded, ``False`` otherwise.
    """
    page_name = section_page_name(chapter_id, section_id)
    base = chapter_url(base_url, chapter_id)
    data = fetch(base + page_name + '.htm')
    if data is None:
        return False
    # Decode once and use the text for both title and image extraction;
    # undecodable bytes are replaced instead of aborting the crawl.
    html = data.decode(PAGE_ENCODING, errors='replace')
    title = extract_title(html, default=page_name)
    save_bytes(out_dir + '/' + title + '.htm', data)
    for image in extract_image_paths(html):
        img_bytes = fetch(base + image)
        if img_bytes is None:
            continue
        # NOTE: the image path comes from the scraped HTML and is used
        # verbatim below the output directory.
        save_bytes(out_dir + "/" + image, img_bytes)
    return True


def main():
    """Crawl every section of every configured chapter."""
    for chapter_id in range(capterId, LAST_CHAPTER_ID + 1):
        # The section counter restarts for every chapter and advances exactly
        # once per page, whether or not the download succeeded.
        for section_id in range(sectionId, LAST_SECTION_ID + 1):
            download_section(url, chapter_id, section_id, dir)


if __name__ == "__main__":
    main()