
Web Scraping Case Studies

Author: 开心的小哈 | Published 2022-07-03 18:56

    Hands-on practice

    1. Scrape the results page for a given search keyword (listed as Sogou, still to be completed; the code below queries Bing instead, and a Sogou variant is sketched after it)
    import requests
    
    heads = {
        "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/102.0.5005.124 Safari/537.36 Edg/102.0.1245.44"}
    
    
    def get_url():
        # Build the search URL from a user-supplied keyword
        kw = input('Enter a search keyword: ')
        return "https://cn.bing.com/search?q=" + kw
    
    
    def get_data(url):
        # Fetch the results page and save it to an HTML file
        res = requests.get(url, headers=heads)
        save_file("search_results", res.text)
    
    
    def save_file(name, data):
        with open(name + ".html", "w", encoding="utf-8") as wf:
            wf.write(data)
    
    
    if __name__ == "__main__":
        url = get_url()
        get_data(url)
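
    The item was originally about Sogou. A minimal sketch of the Sogou variant of the same approach, assuming the standard https://www.sogou.com/web?query=... search URL (not verified here):

    import requests
    
    heads = {
        "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/102.0.5005.124 Safari/537.36 Edg/102.0.1245.44"}
    
    
    def get_sogou(kw):
        # Sogou's web search endpoint; the keyword goes in the `query` parameter
        res = requests.get("https://www.sogou.com/web", params={"query": kw}, headers=heads)
        res.encoding = res.apparent_encoding  # the page may not be served as UTF-8
        with open(kw + ".html", "w", encoding="utf-8") as wf:
            wf.write(res.text)
    
    
    if __name__ == "__main__":
        get_sogou(input("Enter a search keyword: "))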
    
    
    2. Scrape (reverse-engineer) Baidu Translate; a simpler alternative endpoint is sketched after the code
    import requests
    import urllib.parse as parse
    import json
    
    def fanyi(kw):
        url = 'https://fanyi.baidu.com/v2transapi?from=en&to=zh'
        headers = {
            'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8',
            'Cookie': 'REALTIME_TRANS_SWITCH=1; SOUND_SPD_SWITCH=1; HISTORY_SWITCH=1; FANYI_WORD_SWITCH=1; SOUND_PREFER_SWITCH=1; APPGUIDE_10_0_2=1; BIDUPSID=6A0B90F549B4E722A96A29666574A81B; PSTM=1653639263; BAIDUID=6A0B90F549B4E722B61B0337963B4817:FG=1; BAIDUID_BFESS=6A0B90F549B4E722B61B0337963B4817:FG=1; ZFY=NmGJc7JlHfQ:BLiuJWcMARBy:BusCUodzUtsi4qGc2tfQ:C; BAIDU_WISE_UID=wapp_1655032828797_134; Hm_lvt_64ecd82404c51e03dc91cb9e8c025574=1655299374,1655300476,1655300668,1655734579; Hm_lpvt_64ecd82404c51e03dc91cb9e8c025574=1655734579; ab_sr=1.0.1_OTFhOWFiNmI5NzQyMDY0OTQwZGIwMDE5OTRiM2M1Y2I3OTlmOTRhMWQ0MGRiMjMwYzU2MjJjOGUyYWZiYzJmNmYyYjU0MTE0ODU1MGI2NTdkOTI0OGFjMDlmYTg2NTBkODU5MmE0NWE3MzM1ZjE2OGVhNDY1MzRjNjhhMmQzNzZmNjAyZWQxYzI1ZDkwNjdlZjI3M2MzMDE4OWYzN2FkNQ==',
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/102.0.5005.124 Safari/537.36 Edg/102.0.1245.44'
            }
        # NOTE: sign and token are generated by the page's JavaScript and are tied to the
        # Cookie above, so these hard-coded values only work for the session they were copied from.
        data = {'from': 'en', 'to': 'zh', 'query': kw, 'transtype': "realtime", 'simple_means_flag': "3",
                "sign": '830707.544706', 'token': '98cd61560d5388bcc7d0ff60c08c4158', 'domain': 'common'}
        # data = parse.urlencode(data).encode()
        # data = f"from=en&to=zh&query={kw}&transtype=realtime&simple_means_flag=3&sign=830707.544706&token=98cd61560d5388bcc7d0ff60c08c4158&domain=common"
    
        print(data)
        res = requests.post(url, headers=headers, data=data)
        # print(res.content.decode('unicode-escape'))  # turn the unicode escapes back into Chinese characters
        result = res.json()
        # Save the JSON response; ensure_ascii=False keeps the Chinese text readable
        with open("./11.txt", "w", encoding="utf-8") as fp:
            json.dump(result, fp=fp, ensure_ascii=False)
    
    kw = input('Please input a keyword: ')
    fanyi(kw)
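
    The sign and token above change per session, so the request only works while the copied Cookie is valid. A commonly used fallback in tutorials is Baidu Translate's suggestion endpoint, which only needs a plain kw form field; a minimal sketch, assuming that endpoint still responds with JSON as it does in those tutorials:

    import requests
    
    
    def sug(kw):
        # POST the keyword to the suggestion endpoint and return the parsed JSON
        url = 'https://fanyi.baidu.com/sug'
        headers = {'User-Agent': 'Mozilla/5.0'}
        res = requests.post(url, headers=headers, data={'kw': kw})
        return res.json()
    
    
    print(sug('dog'))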
    
    
    3. Scrape the Douban movie ranking by category
    import json
    import requests
    
    headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/102.0.5005.124 Safari/537.36 Edg/102.0.1245.44'}
    
    def Test01():
        # Item 3: the Douban ranking comes back as JSON from an AJAX endpoint
        url = "https://movie.douban.com/j/chart/top_list"
        param = {'type': '24',
                 'interval_id': '100:90',
                 'action': '',
                 'start': '10',
                 'limit': '20'
                 }
    
        res = requests.get(url, params=param, headers=headers)
        list_data = res.json()
        print(list_data)
        # Save the result; ensure_ascii=False keeps the Chinese titles readable
        with open('./douban.json', "w", encoding='utf-8') as fp:
            json.dump(list_data, fp=fp, ensure_ascii=False)
    
    def getKDJ():
        # Item 4: the KFC store-lookup endpoint is paged, so keep requesting the next page on demand
        ke = input('Enter a city name: ')
        a = 1
        url = 'http://www.kfc.com.cn/kfccda/ashx/GetStoreList.ashx?op=keyword'
        while True:
            data = {'cname': "", 'pid': '', "keyword": ke, "pageIndex": a}
            res = requests.post(url, headers=headers, data=data)
            print(res.text)
            # The body is a JSON string; parse it rather than re-serializing it with json.dumps
            data_list = json.loads(res.text)
            print(type(data_list), data_list)
            a += 1
            commit = input("View the next page? (yes/no) ")
            if commit != "yes":
                return None
    
    def getGJJG():
        # Item 5: NMPA (国家药品监管总局) listing; POST the paging parameters to the portal action URL below
        url='http://scxk.nmpa.gov.cn:81/xk/itownet/portalAction.do?hKHnQfLv=53BHcvFf57UciNYtofCPsA9FO16baEIXUe19bRBXhg7rBlmtCHnsik_byn0PzLSY3bVUkihuxWe_sgkAX1WZu_1ybEqU5PPA8hR28JBI5590cYX5rUp16.UNrukVygjFWnB30adCTRLG8jFAp34jFBTtUzms3I0GZlZnxHGJd6HNNWBc_rsje99ao6.US098joA5m4._S2_rOpW2K4U5gu_ojQwSTPniQeOCJRMcaSNX2JDrrhKQNHKHt7Dm6iB_9St26DTwNP5.6TPTVnXNYAngMOkdQoWtp2ClluzSlM3yYvV4SEqFvBQW2JAyrd5ttfZc2rBIpwKA902YkpzXr60lQnJgQo6kbc4L7JK4P94l&8X7Yi61c=4w_361nsYEBznepRcSH0pxcubexKO5Vosw.LtgenPXT_Ik.uhuVjgKDrUG9OVVt97Oo9eEmGIUB9yUqnErd5hJqL1TUMki1bYTFHRHZoNTE5tDPeKYcioTBdHtBMEpkNu'
        data={'on':'true','page':1,"pageSize":15,"productName":"","conditionType":1,"applyname":""}
    
        res=requests.post(url,data=data,headers=headers)
        print(res.text)
    
    
    
    
    
    
    
    
    4. Query KFC restaurant addresses (implemented as getKDJ above)
    
    
    
    5. Scrape data from the NMPA (国家药品监管总局) portal (implemented as getGJJG above)
    6. bs4 practice: locating the chapters of a classical text (《三国演义》 on shicimingju.com)
    from bs4 import BeautifulSoup
    import requests
    
    def get_data():
        headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/102.0.5005.124 Safari/537.36 Edg/102.0.1245.44'}
        url = 'https://www.shicimingju.com/book/sanguoyanyi.html'
        page_text = requests.get(url, headers=headers)
        # Decode the table-of-contents page and parse out the chapter titles and detail-page links
        page_text = page_text.content.decode('utf-8')
        soup = BeautifulSoup(page_text, 'lxml')
        li_list = soup.select('div.book-mulu > ul > li')
        with open('./sango.txt', 'w', encoding='utf-8') as fp:
            for li in li_list:
                title = li.a.string
                content_url = 'https://www.shicimingju.com' + li.a['href']
                # Request the detail page and extract the chapter text
                detail_page_text = requests.get(content_url, headers=headers).content.decode('utf-8')
                detail_soup = BeautifulSoup(detail_page_text, 'lxml')
                div_tag = detail_soup.find('div', class_='chapter_content')
                content = div_tag.get_text()
                fp.write(title + ':' + content + '\n')
                print(title, 'scraped successfully')
    
    get_data()
    
    
    

    XPath practice 1

    # XPath parsing example: collecting city names from aqistudy.cn
    import requests
    from lxml import etree
    
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/103.0.5060.53 Safari/537.36 Edg/103.0.1264.37',
        'Host': 'www.aqistudy.cn'
    }
    
    # Locally saved copy of http://www.aqistudy.cn/historydata/ used for offline testing
    filePath = r'D:\PyTest\XpathDemo1\ddd.html'
    
    
    def get_data():
        # url = 'http://www.aqistudy.cn/historydata/'
        # res = requests.get(url=url, headers=headers)
        # data = res.text
        # tree = etree.HTML(data)
        # Parse the local file instead of requesting the live page
        # with open('./ddd.html', 'r', encoding='utf-8') as wf:
        #     s = wf.read()
    
        tree = etree.parse(filePath, etree.HTMLParser())
        # tree = etree.HTML(filePath)
        # The hot-city <li> nodes and the <li> nodes under the second <div> are collected separately
        hot_li_list = tree.xpath('//div[@class="bottom"]/ul/li')
        all_city_names = []
        for li in hot_li_list:
            name = li.xpath('./a/text()')[0]
            all_city_names.append(name)
    
        city_li_list = tree.xpath('//div[@class="bottom"]/ul/div[2]/li')
        for li in city_li_list:
            city_name = li.xpath('./a/text()')[0]
            all_city_names.append(city_name)
    
        print(all_city_names, len(all_city_names))
    
    
    def get_data2():
        # Same result with a single XPath expression using the union operator |
        tree = etree.parse(filePath, etree.HTMLParser())
        city_names = tree.xpath('//div[@class="bottom"]/ul/li/a/text() | //div[@class="bottom"]/ul/div[2]/li/a/text()')
        print(city_names, len(city_names))
    
    
    if __name__ == "__main__":
        get_data2()
    
    
    

    XPath practice 2

    1. Download the free resume templates (a sketch follows below)
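
    No code was provided for this item. A minimal sketch of one common approach, assuming the free templates are listed at https://sc.chinaz.com/jianli/free.html and that the XPath expressions below still match that site's layout (both are assumptions, not verified here):

    import requests
    from lxml import etree
    
    headers = {'User-Agent': 'Mozilla/5.0'}
    
    
    def download_resumes():
        url = 'https://sc.chinaz.com/jianli/free.html'  # assumed listing page for the free templates
        res = requests.get(url, headers=headers)
        res.encoding = res.apparent_encoding  # avoid mojibake if the page is not served as UTF-8
        tree = etree.HTML(res.text)
        # Assumed layout: each template card is an <a> with a protocol-relative detail-page href and an <img alt> title
        for a in tree.xpath('//div[@id="container"]/div/a'):
            detail_url = 'https:' + a.xpath('./@href')[0]
            title = a.xpath('./img/@alt')[0]
            detail_tree = etree.HTML(requests.get(detail_url, headers=headers).text)
            # Assumed: take the first entry of the download-link list on the detail page
            down_url = detail_tree.xpath('//div[@class="clearfix mt20 downlist"]/ul/li[1]/a/@href')[0]
            with open(title + '.rar', 'wb') as wf:
                wf.write(requests.get(down_url, headers=headers).content)
            print(title, 'downloaded')
    
    
    download_resumes()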
    
    

    Captcha recognition
    Target: the captcha image on the login page of 古诗文网 (gushiwen.cn)

    1. Download the captcha image locally
    2. Call the code provided by a recognition platform to identify the image (a sketch of step 1 follows this list)
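
    A minimal sketch of step 1, assuming the captcha on the gushiwen.cn login page is an <img> with id="imgCode" and a site-relative src (the login URL and the id are assumptions based on common write-ups of this exercise); step 2 depends entirely on which recognition platform you use:

    import requests
    from lxml import etree
    
    headers = {'User-Agent': 'Mozilla/5.0'}
    
    
    def download_captcha():
        login_url = 'https://so.gushiwen.cn/user/login.aspx'  # assumed login page URL
        page_text = requests.get(login_url, headers=headers).text
        tree = etree.HTML(page_text)
        # Assumed: the captcha image carries id="imgCode" and a site-relative src
        img_src = 'https://so.gushiwen.cn' + tree.xpath('//img[@id="imgCode"]/@src')[0]
        img_data = requests.get(img_src, headers=headers).content
        with open('./code.jpg', 'wb') as wf:
            wf.write(img_data)
    
    
    download_captcha()
    # Step 2: feed ./code.jpg to the recognition platform's SDK or HTTP API and read back the text.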

    Multithreading: downloading with a thread pool

    import requests
    from lxml import etree
    import re
    from multiprocessing.dummy import Pool
    
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/103.0.5060.66 Safari/537.36 Edg/103.0.1264.44'}
    url = 'https://www.pearvideo.com/category_8'
    page_text = requests.get(url, headers=headers).text
    tree = etree.HTML(page_text)
    li_list = tree.xpath('//ul[@id="listvideoListUl"]/li')
    urls_video = []
    for li in li_list:
        detail_url = 'https://www.pearvideo.com/' + li.xpath('./div/a/@href')[0]
        name = li.xpath('./div/a/div[2]/text()')[0] + '.mp4'
        # Request the videoStatus interface that belongs to this detail page
        info_page = detail_url.split('_')[-1]
        info_url = 'https://www.pearvideo.com/videoStatus.jsp?contId=' + info_page + '&mrd=0.20124002223369164'
        headers['Referer'] = detail_url  # the interface rejects requests that lack the detail page as Referer
        detail_page_text = requests.get(url=info_url, headers=headers).json()
        video = detail_page_text.get('videoInfo').get('videos').get('srcUrl')
        # srcUrl contains a fake timestamp segment; replace it with "cont-<id>" to get the real video address
        d_video = 'cont-%s' % (info_page)
        lis = re.split('-|/', video)[6]
        video_path = video.replace(lis, d_video)
        dic = {
            'name': name,
            'url': video_path
        }
        urls_video.append(dic)
    
    
    # Use a thread pool for the download requests, which are the slow, blocking part
    def get_video_data(video_dic):
        url = video_dic['url']
        name = video_dic['name']
        name = re.sub(r'[\\/:*?"<>|]', '-', name)  # strip characters that are illegal in file names
        print(name, 'downloading...')
        res = requests.get(url, headers=headers).content
        with open(name, 'wb') as wf:
            wf.write(res)
            print(name, 'download finished!')
    
    
    pool = Pool(4)
    pool.map(get_video_data, urls_video)
    pool.close()
    pool.join()
    
    

    12306 website login
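
    No code was provided for this item either. The usual approach in these tutorials is to drive a real browser with Selenium, because the login flow involves client-side verification. A minimal sketch, assuming the login page URL and element ids below (inspect the page to confirm them before running; the follow-up slider/SMS verification is not automated here):

    from selenium import webdriver
    from selenium.webdriver.common.by import By
    import time
    
    driver = webdriver.Chrome()
    driver.get('https://kyfw.12306.cn/otn/resources/login.html')  # assumed login page URL
    time.sleep(2)
    
    # Assumed element ids for the account login form
    driver.find_element(By.ID, 'J-userName').send_keys('your_username')
    driver.find_element(By.ID, 'J-password').send_keys('your_password')
    driver.find_element(By.ID, 'J-login').click()
    
    # 12306 then asks for an extra verification step (slider or SMS); complete it manually
    time.sleep(30)
    driver.quit()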
