Python Day 3 (Part 3): An lxml Scraper

Author: 敏姐姐_e9d0 | Published 2018-10-30 20:23

Qiushibaike Scraper, Upgraded

Previously we only scraped each poster's username, gender, and age, plus the joke text, laugh count, and comment count. Today we build on that and also collect each user's detailed profile: for every joke we follow the link to the author's homepage and scrape the data there.
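The key lxml idiom used throughout the script below is that xpath() always returns a list, which is empty when nothing matches, so every [0] access is guarded by a conditional expression that falls back to a default value (such as '匿名用户' or '不详') instead of raising an IndexError. Here is a minimal sketch of that pattern; the HTML snippet and class name are made up purely for illustration:

from lxml import etree

# Made-up HTML: the second block has no <h2>, so the relative XPath query returns an empty list.
doc = etree.HTML('<div class="block"><h2>user_a</h2></div><div class="block"></div>')

for block in doc.xpath('//div[@class="block"]'):
    names = block.xpath('h2/text()')           # relative XPath on one node, returns a list
    name = names[0] if names else '匿名用户'    # fall back to a default when the list is empty
    print(name)                                # prints: user_a, then 匿名用户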

import requests
from lxml import etree
import csv
import time

# Output file 1: one row per joke scraped from the list pages.
fp1 = open('qiu_1.csv', 'w+', encoding='utf-8', newline='')
writer1 = csv.writer(fp1)
writer1.writerow(['id', 'sex', 'age', 'laugh', 'comment', 'user_url', 'content'])

# Output file 2: one row per user profile page that could be visited.
fp2 = open('qiu_2.csv', 'w+', encoding='utf-8', newline='')
writer2 = csv.writer(fp2)
writer2.writerow(['fans', 'topic', 'qiushi', 'comment_1', 'favour', 'handpick',
                  'marital_status', 'constellation', 'profession', 'home', 'qiushi_age'])

# Send a desktop browser User-Agent so the site serves the normal HTML pages.
header = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/57.0.2987.133 Safari/537.36'
}

def get_info(url):
    """Scrape one list page: basic info for every joke, then follow each author's profile page."""
    html = requests.get(url, headers=header)
    selector = etree.HTML(html.text)
    infos = selector.xpath('//div[@class="col1"]/div')   # one div per joke
    base_url = 'https://www.qiushibaike.com'
    for info in infos:
        # Anonymous posts have no author link, so fall back to a placeholder name ('匿名用户').
        id = info.xpath('div[1]/a[2]/h2/text()')[0] if len(info.xpath('div[1]/a[2]/h2/text()')) == 1 else '匿名用户'
        # Gender is encoded in the class of the age badge ("manIcon" / "womenIcon").
        jug_sex = info.xpath('div[1]/div/@class')
        if len(jug_sex) == 0:
            sex = '不详'   # unknown
            age = '不详'
        elif jug_sex[0] == 'articleGender manIcon':
            sex = '男'
            age = info.xpath('div[1]/div/text()')[0]
        else:
            sex = '女'
            age = info.xpath('div[1]/div/text()')[0]
        content = info.xpath('a[1]/div/span[1]/text()')[0]
        laugh = info.xpath('div[2]/span[1]/i/text()')[0]
        comment = info.xpath('div[2]/span[2]/a/i/text()')[0] if info.xpath('div[2]/span[2]/a/i/text()') else None
        user_url = base_url + info.xpath('div[1]/a[2]/@href')[0] if info.xpath('div[1]/a[2]/@href') else None
        writer1.writerow([id, sex, age, laugh, comment, user_url, content])
        # Anonymous users have no homepage to visit.
        if user_url:
            get_user_info(user_url)
    time.sleep(1)   # be polite between list pages

def get_user_info(url):
    """Scrape the activity counters and profile fields from a user's homepage."""
    html = requests.get(url, headers=header)
    selector = etree.HTML(html.text)
    # Pages showing the "user-setting" block belong to accounts without a public profile; skip them.
    if selector.xpath('//div[@class="user-block user-setting clearfix"]'):
        return
    # First column: activity counters (fans, topics, jokes, comments, favours, handpicked).
    stats = []
    for i in range(1, 7):
        item = selector.xpath('//div[@class="user-col-left"]/div[1]/ul/li[{}]/text()'.format(i))
        stats.append(item[0] if item else None)
    fans, topic, qiushi, comment_1, favour, handpick = stats
    # Second column: profile details; missing fields are recorded as '不详' (unknown).
    details = []
    for i in range(1, 6):
        item = selector.xpath('//div[@class="user-col-left"]/div[2]/ul/li[{}]/text()'.format(i))
        details.append(item[0] if item else '不详')
    marital_status, constellation, profession, home, qiushi_age = details
    print(fans, topic, qiushi, comment_1, favour, handpick,
          marital_status, constellation, profession, home, qiushi_age)
    writer2.writerow([fans, topic, qiushi, comment_1, favour, handpick,
                      marital_status, constellation, profession, home, qiushi_age])


if __name__ == '__main__':
    # Text-only jokes live under /text/; crawl list pages 1 through 35.
    urls = ['https://www.qiushibaike.com/text/page/{}/'.format(i) for i in range(1, 36)]
    for url in urls:
        get_info(url)
    fp1.close()
    fp2.close()
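Once the crawl finishes, the two CSV files can be spot-checked with the standard csv module. A quick sketch, assuming the script above has already run and produced qiu_1.csv in the current directory:

import csv

with open('qiu_1.csv', encoding='utf-8') as f:
    reader = csv.reader(f)
    print(next(reader))        # header row: ['id', 'sex', 'age', 'laugh', 'comment', 'user_url', 'content']
    for i, row in enumerate(reader):
        print(row)
        if i >= 4:             # peek at the first five records only
            break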
