
Scraping Qidian with XPath

Author: IT的咸鱼 | Published 2018-10-21 19:05
A simple crawler that scrapes Qidian with XPath and stores the results in a MySQL database.
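
The script assumes three MySQL tables already exist (qidian1, info_parse, content_parse). The original post does not show their definitions; the sketch below creates a minimal schema that matches the INSERT statements in the script, with all column types being my assumptions:

    import pymysql  # pip3 install PyMySQL

    # Assumed schema: one table per kind of record the crawler stores.
    DDL = [
        """CREATE TABLE IF NOT EXISTS qidian1 (
            id INT AUTO_INCREMENT PRIMARY KEY,
            title VARCHAR(255),    -- book title
            zuozhe VARCHAR(255),   -- author
            fenlei VARCHAR(255),   -- category (main.sub)
            content TEXT           -- synopsis from the listing page
        ) DEFAULT CHARSET=utf8mb4""",
        """CREATE TABLE IF NOT EXISTS info_parse (
            id INT AUTO_INCREMENT PRIMARY KEY,
            href VARCHAR(512)      -- chapter URL (protocol-relative)
        ) DEFAULT CHARSET=utf8mb4""",
        """CREATE TABLE IF NOT EXISTS content_parse (
            id INT AUTO_INCREMENT PRIMARY KEY,
            content2 LONGTEXT      -- chapter body text
        ) DEFAULT CHARSET=utf8mb4""",
    ]

    conn = pymysql.connect(host='localhost', port=3306, user='root',
                           database='your_database', password='your_password',
                           charset='utf8')
    with conn.cursor() as cursor:
        for stmt in DDL:
            cursor.execute(stmt)
    conn.commit()
    conn.close()

The full script: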
    import requests  # pip3 install requests
    from lxml import etree  # pip3 install lxml
    import time
    import re
    import pymysql  # pip3 install PyMySQL
    
    def db():
        # Connect to the MySQL database (a new connection is opened for every insert below)
        conn = pymysql.connect(host='localhost', port=3306, user='root',
                               database='your_database', password='your_password',
                               charset='utf8')
        print('Connected to the database')
        return conn
    def insertdb0(conn, data0):
        # Create a cursor
        cursor = conn.cursor()
        # Build the INSERT statement: column names come from the dict keys;
        # the values are bound as query parameters.
        sql = """
            INSERT INTO qidian1(%s) VALUES(%s)
        """ % (','.join(data0.keys()),
               ','.join(['%s'] * len(data0)))
        try:
            cursor.execute(sql, list(data0.values()))
            conn.commit()
        except Exception:
            print('Insert failed')
            conn.rollback()
    
    def insertdb1(conn, data1):
        cursor = conn.cursor()
        sql = """
            INSERT INTO info_parse(%s) VALUES(%s)
        """ % (','.join(data1.keys()),
               ','.join(['%s'] * len(data1)))
        try:
            cursor.execute(sql, list(data1.values()))
            conn.commit()
        except Exception:
            print('Insert failed')
            conn.rollback()
    
    def insertdb2(conn, data2):
        cursor = conn.cursor()
        sql = """
            INSERT INTO content_parse(%s) VALUES(%s)
        """ % (','.join(data2.keys()),
               ','.join(['%s'] * len(data2)))
        try:
            cursor.execute(sql, list(data2.values()))
            conn.commit()
        except Exception:
            print('Insert failed')
            conn.rollback()
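    # NOTE: insertdb0 / insertdb1 / insertdb2 differ only in the target table;
    # a generic helper that replaces all three is sketched after the script.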
    
    # Build the absolute URL of each chapter page (hrefs in the HTML are protocol-relative)
    def content_url(ctx1):
        for k, v in ctx1.items():
            src = 'https:{}'.format(v)
            # time.sleep(5)
            content_parse(src)
    # Parse a chapter page with a regular expression
    def content_parse(src):
        req_headers = {
            'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.100 Safari/537.36',
            'Host': 'read.qidian.com',
        }
        response = requests.get(src, headers=req_headers)
        # Match the chapter body
        content1 = re.compile(r'.*?<div.*?class="read-content j_readContent">(.*?)</div>.*?', re.S)
        result = re.findall(content1, response.text)[0]
        # Strip any remaining tags, &nbsp; entities and newlines
        content2 = re.sub(r"<[^>]*>|&nbsp;|\n", "", result)
        data2 = {
            'content2': content2
        }
        conn = db()
        insertdb2(conn, data2)
    
        
    
    # Build the absolute URL of each book's info page
    def info_url(ctx):
        for k, v in ctx.items():
            src = 'https:{}'.format(v)
            info_parse(src)
    # Parse the info page, extracting chapter links with XPath
    def info_parse(src):
        req_headers = {
            'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.100 Safari/537.36',
            'Host': 'book.qidian.com',
        }
        response = requests.get(src, headers=req_headers)
        xml2 = etree.HTML(response.text)
        # Chapter list items of the first volume
        zong = xml2.xpath('//div[@class="volume-wrap"][1]/div[@class="volume"]/ul[@class="cf"]/li')
        for i in zong:
            href = i.xpath('./a/@href')[0]
            data1 = {
                'href': href
            }
            conn = db()
            insertdb1(conn, data1)
            time.sleep(2)  # pause between chapter requests
            ctx1 = {
                'href': href
            }
            content_url(ctx1)
            
    
    # Listing page: extract book data with XPath
    def qidian_index():
        url = 'https://www.qidian.com/all'
        req_headers = {
            'User-Agent':'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.100 Safari/537.36'
        }
        response = requests.get(url,headers=req_headers)
        # print(response.text)
        print(response.url)
        xml1 = etree.HTML(response.text)
        # print(xml1)
        # Cover image URLs (extracted but not stored)
        img = xml1.xpath('//div[@class="book-img-box"]/a/img/@src')
        zong = xml1.xpath('//div[@class="book-mid-info"]')
        for i in zong:
            # Title
            title = i.xpath('./h4/a/text()')[0]
            # Author
            zuozhe = i.xpath('./p/a[1]/text()')[0]
            # Main category and subcategory
            fenlei1 = i.xpath('./p/a[2]/text()')[0]
            fenlei2 = i.xpath('./p/a[3]/text()')[0]
            fenlei = '{}.{}'.format(fenlei1, fenlei2)
            # Synopsis
            content = i.xpath('./p/text()')[0].strip()
            # Link to the book's info page (protocol-relative)
            src_cover = i.xpath('./h4/a/@href')[0]
            
            data0 = {
                'title':title,
                'zuozhe':zuozhe,
                'fenlei':fenlei,
                'content':content,
            }
            conn = db()
            insertdb0(conn,data0)
            # time.sleep(2)
            ctx = {
                'src_cover':src_cover
            }
            
            info_url(ctx)
    
    
    
    if __name__ == '__main__':
        qidian_index()
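
One cleanup worth making, hinted at in the NOTE above: the three insert functions are identical apart from the target table, so they can collapse into a single helper. A minimal sketch (the `insert_row` name is mine, not from the original post; pass only trusted table names, since the table name is interpolated into the SQL string):

    def insert_row(conn, table, data):
        # Generic parameterized insert: column names come from the dict keys,
        # values are bound as query parameters.
        cursor = conn.cursor()
        sql = 'INSERT INTO {}({}) VALUES({})'.format(
            table,
            ','.join(data.keys()),
            ','.join(['%s'] * len(data)))
        try:
            cursor.execute(sql, list(data.values()))
            conn.commit()
        except Exception:
            print('Insert failed')
            conn.rollback()

    # Usage:
    #   insert_row(conn, 'qidian1', data0)
    #   insert_row(conn, 'info_parse', data1)
    #   insert_row(conn, 'content_parse', data2)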
    
