Commonly Used Crawler Code

Author: 探索1者 | Published 2019-01-02 19:35
    Scraping all images from a given Tieba forum with XPath
    # Approach
    # 1. Get the forum's listing page URL, look at the next page, and work out the URL pattern
    # 2. Collect the URL of every thread on one listing page
    # 3. Request each thread URL and extract the image URLs inside it
    # 4. Request each image URL in turn and save the response locally in 'wb' mode
    
    # thread_link_list = parseHtml.xpath('..')
    # for thread_link in thread_link_list:
    #   html = response from requesting this thread
    #   for img_link in img_link_list:
    #       with open('ll.jpg', 'wb') as f:
    #           f.write()
    
    # //div[@class="t_con cleafix"]/div/div/div/a/@href
    # //div[@class="d_post_content j_d_post_content  clearfix"]/img[@class="BDE_Image"]/@src
    
    # Example query string: kw=%E6%A0%A1%E8%8A%B1&pn=100
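    # A quick check of how that query string is built (a minimal sketch; the keyword
    # below is simply what the sample value above decodes to):
    #   >>> import urllib.parse
    #   >>> urllib.parse.unquote("kw=%E6%A0%A1%E8%8A%B1&pn=100")
    #   'kw=校花&pn=100'
    #   >>> urllib.parse.urlencode({"kw": "校花", "pn": 100})
    #   'kw=%E6%A0%A1%E8%8A%B1&pn=100'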
    
    from lxml import etree
    import requests
    import urllib.parse
    
    class BaiduImgSpider:
        def __init__(self):
            self.baseurl = 'http://tieba.baidu.com'
            self.headers = {"User-Agent": "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0)"}
            self.mainurl = 'http://tieba.baidu.com/f?'
        
        # Get the URL of every thread on one listing page
        def getPageUrl(self, params):
            # Send the request
            res = requests.get(self.mainurl,params=params,headers=self.headers)
            res.encoding = 'utf-8'
            html = res.text
            
            # Extract the href of every thread on the page
            parseHtml = etree.HTML(html)
            tList = parseHtml.xpath('//div[@class="t_con cleafix"]/div/div/div/a/@href')
            
            for t in tList:
                tLink = self.baseurl + t
                self.getImgUrl(tLink)        
        
        # Get the URL of every image inside a single thread
        def getImgUrl(self, tLink):
            # Fetch the thread page
            res = requests.get(tLink,headers=self.headers)
            res.encoding = 'utf-8'
            html = res.text
            
            # Extract each image's src (plus embedded video URLs)
            parseHtml = etree.HTML(html)
            imgList = parseHtml.xpath('//div[@class="video_src_wrapper"]/embed/@data-video | //div[@class="d_post_content j_d_post_content  clearfix"]/img[@class="BDE_Image"]/@src')
            
            # Iterate over the image links and save each one
            for img in imgList:
                self.writeImage(img)
        
        # Save one image to the local disk
        def writeImage(self, img):
            # Request the image URL and take the binary body from res.content
            res = requests.get(img, headers=self.headers)
            html = res.content
            # Write to a local file, using the last 12 characters of the URL as the filename
            filename = img[-12:]
            with open(filename, 'wb') as f:
                f.write(html)
                print("%s downloaded" % filename)
        
        # Main entry
        def workOn(self):
            name = input('Enter the Tieba forum name to crawl: ')
            begin = int(input("Start page: "))
            end = int(input("End page: "))
            for n in range(begin, end+1):
                # Each listing page holds 50 threads, so pn advances in steps of 50
                pn = (n-1) * 50
                params = {
                        "kw": name,
                        "pn": pn
                    }
                self.getPageUrl(params)
    #            params = urllib.parse.urlencode(params)
    #            # Build the full URL by hand instead of passing params
    #            url = self.baseurl + "/f?" + params
    #            self.getPageUrl(url)
        
    if __name__ == "__main__":
        spider = BaiduImgSpider()
        spider.workOn()
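    
    The filename img[-12:] works for typical Tieba image URLs but can clash or pick up odd
    characters for unusual ones. A small helper like the sketch below (an alternative, not
    part of the original code) keeps only the last path component instead:
    
    import os
    from urllib.parse import urlsplit
    
    def make_filename(img_url):
        # Keep only the final path component and drop any query string
        name = os.path.basename(urlsplit(img_url).path)
        return name if name else 'unnamed.jpg'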
    
    
    Scraping Qiushibaike text posts with XPath
    import requests
    from lxml import etree
    import pymongo
    
    class QiushSpider:
        def __init__(self):
            self.url = "https://www.qiushibaike.com/text/"
            self.headers = {"User-Agent": "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0)"}
            # Connection object
            self.conn = pymongo.MongoClient("10.8.20.56",27017)
            # Database object
            self.db = self.conn["Qiushidb"]
            # Collection object
            self.myset = self.db["zhuanye"]
        
        # Fetch the page
        def getPage(self):
            res = requests.get(self.url, headers=self.headers)
            res.encoding = 'utf-8'
            html = res.text
            
            self.parsePage(html)
        
        # Parse the page and write the results to the database
        def parsePage(self, html):
            # Build the parse (root node) object
            parseHtml = etree.HTML(html)
            # Use XPath to get one node object per post
            baseList = parseHtml.xpath('//div[contains(@id,"qiushi_tag_")]')
            # Walk the post nodes and extract the fields one by one
            for base in baseList:
                # base: an <Element ...> node object
                # User nickname
                username = base.xpath('./div/a/h2')
                if username:
                    username = username[0].text
                else:
                    username = '匿名用户'  # anonymous user
                    
                # Post content
                content = base.xpath('./a/div[@class="content"]/span/text()')
                content = "".join(content).strip()
                # Number of laughs
                laugh = base.xpath('.//i[@class="number"]')[0].text
                # Number of comments
                comments = base.xpath('.//i[@class="number"]')[1].text
                
                # Build a dict and insert it into MongoDB
                d = {
                    "username": username.strip(),    
                    "content": content.strip(),
                    "laugh": laugh.strip(),
                    "comments": comments.strip()
                    }
                self.myset.insert_one(d)        
        
        # Main entry
        def workOn(self):
            print('Crawling...')
            self.getPage()
            print('Done; results stored in the Qiushidb database')
        
    if __name__ == '__main__':
        spider = QiushSpider()
        spider.workOn()
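    
    After a run, the stored documents can be read back to check the inserts. A minimal
    sketch, assuming the same host, database and collection names as above:
    
    import pymongo
    
    conn = pymongo.MongoClient("10.8.20.56", 27017)
    myset = conn["Qiushidb"]["zhuanye"]
    # Each document holds username / content / laugh / comments
    for doc in myset.find({}, {"_id": 0}):
        print(doc)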
        
    
    requests module example: scraping the Douban movie chart

    Target: Douban Movies - Chart - drama movie titles and ratings
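
    The endpoint returns a JSON array with one object per movie; the spider below reads each
    object's title and score and inserts them into a douban(name, score) table. A minimal
    sketch for creating that table beforehand (the column types are an assumption; only the
    table and column names come from the INSERT statement used in parsePage):

    import pymysql

    db = pymysql.connect(host="10.8.20.56", user="kk", password="123456",
                         database="spiderdb", charset="utf8")
    cursor = db.cursor()
    cursor.execute("create table if not exists douban(name varchar(200), score float)")
    db.commit()
    db.close()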

    import requests
    import json
    import pymysql
    
    class DoubanSpider:
        def __init__(self):
            self.url = "https://movie.douban.com/j/chart/top_list?"
            self.headers = {"User-Agent": "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0)"}
            
            self.db = pymysql.connect(host="10.8.20.56", user="kk", password="123456",
                                      database="spiderdb", charset="utf8")
            self.cursor = self.db.cursor()
        
        # Fetch the page
        def getPage(self, params):
            res = requests.get(self.url,params=params,headers=self.headers)
            res.encoding = "utf-8"
            html = res.text
    #        print(html)
            # html is a JSON array: [{one movie}, {...}, ...]
            self.parsePage(html)
        
        # Parse the JSON and insert rows
        def parsePage(self, html):
            ins = "insert into douban(name,score) values(%s,%s)"        
            rList = json.loads(html)
            for rDict in rList:
                name = rDict["title"]
                score = rDict["score"]
                L = [name.strip(),float(score.strip())]
                print(L)
    #            print(name, score)
                self.cursor.execute(ins,L)
                self.db.commit()
                print("插入成功")
            self.cursor.close()
            self.db.close()
                    
        # Main entry
        def workOn(self):
            number = input("Number of movies to fetch: ")
            params = {
                    "type": 11,
                    "interval_id": "100:90",
                    "action": "",
                    "start": "0",
                    "limit": number
                    }
            self.getPage(params)
        
    if __name__ == '__main__':
        spider = DoubanSpider()
        spider.workOn()
    
    Scraping JD product listings with selenium + chromedriver
    from selenium import webdriver
    import time 
    
    # Create the browser object
    driver = webdriver.Chrome()
    # Open the JD homepage
    driver.get('https://www.jd.com/')
    # Find the search box, read a keyword from the terminal and type it in
    text = driver.find_element_by_class_name('text')
    key = input("Enter the search keyword: ")
    text.send_keys(key)
    # Click the search button
    button = driver.find_element_by_class_name('button')
    button.click()
    time.sleep(2)
    
    while True:
        # Run a script that scrolls the page to the very bottom so the remaining items load
        driver.execute_script('window.scrollTo(0, document.body.scrollHeight)')
        time.sleep(3)
        # Extract the product nodes
        rList = driver.find_elements_by_xpath('//div[@id="J_goodsList"]//li')
        # rList: [product node 1, product node 2, ...]
        for r in rList:
            # The visible text of each card lists price, name, comment count and shop on separate lines
            contentList = r.text.split('\n')
            price = contentList[0]
            name = contentList[1]
            commit = contentList[2]
            market = contentList[3]
            
            d = {
                    "price": price,
                    "name": name,
                    "comments": commit,
                    "shop": market,
                    }
            with open("jd.json","a",encoding="utf-8") as f:
                f.write(str(d) + '\n')
            
        # Click "next page"; find() returns -1 when the substring is absent
        if driver.page_source.find('pn-next disabled') == -1:
            driver.find_element_by_class_name('pn-next').click()
            time.sleep(3)
        else:
            print("Crawling finished")
            break
              
    # Next page clickable: button class is "pn-next"
    # Next page not clickable: button class is "pn-next disabled"
    # Close the browser
    driver.quit()
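
    Splitting r.text on newlines assumes that price, name, comment count and shop always
    appear in that order inside each product card. If that ordering is in doubt, each field
    can be read from its own sub-element instead; a sketch under the assumption that JD's
    list items use p-price / p-name / p-commit / p-shop classes (class names not verified
    in the original post):

    def parse_item(r):
        # r is one <li> element from the J_goodsList container
        return {
            "price": r.find_element_by_css_selector('div.p-price').text,
            "name": r.find_element_by_css_selector('div.p-name').text,
            "comments": r.find_element_by_css_selector('div.p-commit').text,
            "shop": r.find_element_by_css_selector('div.p-shop').text,
        }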
    
