Web Scraping Notes (2): Data Parsing

Author: 拼了老命在学习 | Published 2020-07-09 22:28

    1. XPath syntax

    (Figure: XPath syntax reference)
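    Since the syntax reference above was an image, here is a minimal sketch of the most common XPath expressions, run with lxml against a small made-up HTML fragment (all tag names, classes and URLs below are for illustration only):

    from lxml import etree
    # a tiny, made-up HTML fragment just to exercise the expressions
    text = """
    <div id="content">
      <ul class="list">
        <li><a href="/a">first</a></li>
        <li><a href="/b">second</a></li>
      </ul>
    </div>
    """
    html = etree.HTML(text)
    print(html.xpath("//li"))                 # // selects matching nodes anywhere in the document
    print(html.xpath("//ul/li[1]"))           # / selects direct children; [n] picks the n-th one (1-based)
    print(html.xpath("//a/@href"))            # @attr extracts attribute values
    print(html.xpath("//a/text()"))           # text() extracts the text content
    print(html.xpath("//ul[@class='list']"))  # [@attr='value'] filters by attribute value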

    2.1 Parsing HTML strings and files with lxml

    from lxml import etree
    # Parse an HTML string
    html = etree.HTML(text)
    print(etree.tostring(html, encoding="utf-8").decode("utf-8"))
    # Parse an HTML file (note: parsing a file requires etree.parse, not etree.HTML)
    html = etree.parse("lagou.html")
    print(etree.tostring(html, encoding="utf-8").decode("utf-8"))
    # etree.parse defaults to the XML parser; if that fails on an HTML file,
    # create an explicit HTML parser and pass it in
    parser = etree.HTMLParser(encoding='utf-8')
    html = etree.parse("lagou.html", parser=parser)
    print(etree.tostring(html, encoding="utf-8").decode("utf-8"))
    

    2.2 Using XPath with lxml

    from lxml import etree
    parser = etree.HTMLParser(encoding="utf-8")
    html = etree.parse('tencent.html', parser=parser)
    # 1. Get all tr tags (xpath always returns a list)
    trs = html.xpath("//tr")
    for tr in trs:
        print(etree.tostring(tr, encoding='utf-8').decode("utf-8"))
    # 2. Get the second tr tag
    tr = html.xpath("//tr[2]")[0]
    print(etree.tostring(tr, encoding='utf-8').decode("utf-8"))
    # 3. Get all tr tags with class="even"
    trs_even = html.xpath("//tr[@class='even']")
    # 4. Get the href attribute of all a tags
    alist = html.xpath("//a/@href")
    # 5. Get the text under a tag (here, the tr from step 2)
    title = tr.xpath(".//td[1]//text()")
    

    Example: scraping the Dytt8 movie site (电影天堂)

    import requests
    from lxml import etree
    Base_DOMAIN = 'https://www.dytt8.net/'
    # url = 'https://www.dytt8.net/html/gndy/dyzz/list_23_1.html'
    HEADERS = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.61 Safari/537.36",
        "Referer": "https://www.dytt8.net/html/gndy/dyzz/list_23_2.html"}
    
    
    def get_url(url):
        resp = requests.get(url, headers=HEADERS)
        # text = resp.content.decode('gbk', errors='ignore')
        text = resp.text
        html = etree.HTML(text)
        links = html.xpath("//table[@class='tbspan']//a/@href")
        urls = map(lambda url: Base_DOMAIN + url, links)
        return urls
    
    
    def jx(url):
        movies = {}
        resp = requests.get(url, headers=HEADERS)
        text = resp.content.decode('gbk', errors='ignore')
        html = etree.HTML(text)
        movie_title = html.xpath(
            "//div[@class='title_all']//font[@color='#07519a']/text()")[0]
        movies["title"] = movie_title
        Zoom = html.xpath("//div[@id='Zoom']")[0]
        photos = Zoom.xpath(".//img/@src")
        haibao = photos[0]
        movies["haibao"] = haibao
        infos = Zoom.xpath(".//text()")
        for index, info in enumerate(infos):
            # enumerate(infos) yields (index, item); the index is needed below
            # to collect the multi-line actor list and the synopsis
            if info.startswith("◎年  代"):
                # startswith(prefix) checks whether the string begins with prefix;
                # info_1() strips the prefix and surrounding whitespace
                year = info_1(info, "◎年  代")
                movies["years"] = year
            elif info.startswith("◎豆瓣评分"):
                scores = info_1(info, "◎豆瓣评分")
                movies["scores"] = scores
            elif info.startswith("◎主  演"):
                info = info_1(info, "◎主  演")
                actor = [info]
                for x in range(index + 1, len(infos)):
                    actors = infos[x].strip()
                    if actors.startswith("◎"):
                        break
                    actor.append(actors)
                movies["actors"] = actor
            elif info.startswith("◎简  介"):
                info = info_1(info, "◎简  介")
                for x in range(index + 1, len(infos)):
                    profile = infos[x].strip()
                    if profile.startswith("◎"):
                        break
                    movies["profile"] = profile
        download_url = html.xpath("//td[@bgcolor='#fdfddf']/a/@href")
        movies["download_url"] = download_url
        return movies
    
    
    def info_1(info, rule):
        return info.replace(rule, "").strip()
    
    
    def spider():
        base_url = 'https://www.dytt8.net/html/gndy/dyzz/list_23_{}.html'  # placeholder for the page number
        film = []
        for x in range(1, 2):
            url = base_url.format(x)  # fill in the page number to get the full list-page URL
            films_details = get_url(url)
            for page_url in films_details:
                movie = jx(page_url)
                film.append(movie)
                print(film)
        # with open("E:/桌面/电影.txt", "w") as f:
        #     for x in film:
        #         f.write("\n" + str(x))
    
    
    
    if __name__ == '__main__':
        spider()
    

    3. The BeautifulSoup4 library

    BeautifulSoup is another HTML/XML parser, used for parsing and extracting data from HTML/XML. It works on the HTML DOM and loads the whole document, whereas XPath only traverses the parts it needs, so BeautifulSoup is generally slower than XPath, but it is simpler to use for parsing HTML.
    Installation: via pip

    pip install beautifulsoup4
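    BeautifulSoup delegates the actual parsing to a pluggable backend; the examples below use the 'lxml' and 'html5lib' parsers, which are separate packages and need to be installed as well:

    pip install lxml html5lib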
    

    Basic usage

    from bs4 import BeautifulSoup
    soup = BeautifulSoup(html, 'lxml')  # parse the HTML with the lxml parser
    # 1. Get all tr tags
    trs = soup.find_all('tr')
    # 2. Get the second tr tag
    tr_2 = soup.find_all('tr', limit=2)[1]  # limit caps the number of results; find_all returns a list
    # 3. Get all tr tags with class="even"
    tr_even = soup.find_all('tr', class_='even')
    tr_even = soup.find_all('tr', attrs={'class': 'even'})  # attrs filters by arbitrary attributes
    # 4. Get the a tags with id="test" and class="test"
    alist = soup.find_all('a', id='test', class_='test')
    alist = soup.find_all('a', attrs={'id': 'test', 'class': 'test'})
    # 5. Get the href attribute of the a tags
    alist = soup.find_all('a')
    for a in alist:
        # method 1
        href = a['href']
        # method 2
        href = a.attrs['href']
    # 6. Get all text
    tr_3 = soup.find_all('tr')[1:]  # skip the first tr
    for tr in tr_3:
        # tr.strings keeps whitespace such as "\n"; tr.string returns a single string (or None);
        # tr.get_text() returns one concatenated string rather than a list
        infos = tr.stripped_strings  # generator of non-empty, stripped strings
        infos = list(infos)  # convert to a list to index the elements
    

    CSS selectors: select

    # 1. Find by tag name
    print(soup.select('a'))
    # 2. Find by class name, e.g. class="sy"
    print(soup.select('.sy'))
    # 3. Find by id
    print(soup.select('#sy'))
    # 4. Combined selectors: tag plus id/class, etc.
    print(soup.select('p #sy'))
    # 5. Find by attribute
    print(soup.select("a[href='http://......']"))
    

    Example

    # BeautifulSoup example with a simple data-visualization step
    import requests
    from bs4 import BeautifulSoup
    from pyecharts.charts import Bar  # visualization library, version 1.7.1; the API changed in newer versions
    from pyecharts import options as opts
    
    weather = []
    def page_parse(url):
        headers = {
            'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.61 Safari/537.36'
        }
        resp = requests.get(url, headers=headers).content.decode('utf-8')
        soup = BeautifulSoup(resp, 'html5lib')  # the Hong Kong/Macao/Taiwan page has malformed HTML, so html5lib is needed
        conmidtabs = soup.find('div', class_='conMidtab')
        tables = conmidtabs.find_all('table')
        for table in tables:
            trs = table.find_all('tr')[2:]  # skip the two header rows
            for index, tr in enumerate(trs):
                tds = tr.find_all('td')
                city_td = tds[0]
                if index == 0:
                    city_td = tds[1]  # the first data row starts with a province cell, so the city is in the second td
                city = list(city_td.stripped_strings)[0]
                temp_td = tds[-2]
                temp = list(temp_td.stripped_strings)[0]
                # print({'city': city, 'min-temp': temp})
                weather.append({'city': city, 'min_temp': int(temp)})
        # with open('E:/桌面/weather.txt','w')as fp:
        #     for x in weather:
        #         fp.write('\n'+str(x))
    
    
    def main():
        urls = [
            'http://www.weather.com.cn/textFC/hb.shtml',
            'http://www.weather.com.cn/textFC/db.shtml',
            'http://www.weather.com.cn/textFC/hd.shtml',
            'http://www.weather.com.cn/textFC/hz.shtml',
            'http://www.weather.com.cn/textFC/hn.shtml',
            'http://www.weather.com.cn/textFC/xb.shtml',
            'http://www.weather.com.cn/textFC/xn.shtml',
            'http://www.weather.com.cn/textFC/gat.shtml'
        ]
        for url in urls:
            page_parse(url)
        weather.sort(key=lambda w: w['min_temp'])
        data = weather[0:10]
        # print(data)
        cities = list(map(lambda x: x['city'], data))
        min_temps = list(map(lambda x: x['min_temp'], data))
        chart = Bar()  # create a bar chart
        chart.add_xaxis(cities)
        chart.add_yaxis('', min_temps)
        # set the title and axis names in a single call; calling set_global_opts
        # repeatedly would reset the options omitted in the later calls
        chart.set_global_opts(
            title_opts=opts.TitleOpts(title="天气预报"),
            xaxis_opts=opts.AxisOpts(name='城市'),
            yaxis_opts=opts.AxisOpts(name='温度'),
        )
        chart.render('E:/桌面/天气.html')
    
    
    
    if __name__ == '__main__':
        main()
    

    4. Regular expressions

    Basics


    (Figures: reference tables for matching single characters and for matching multiple characters)
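    Since the two reference tables above were images, here is a minimal sketch of the metacharacters such tables usually cover, based on the standard re syntax (the sample strings are made up):

    import re
    # matching a single character
    print(re.match(r'.', 'abc').group())         # .   any character except a newline
    print(re.match(r'\d', '123abc').group())     # \d  a digit; \D a non-digit
    print(re.match(r'\s', ' hi').group())        # \s  whitespace; \S non-whitespace
    print(re.match(r'\w', 'a_1').group())        # \w  a letter, digit or underscore; \W the opposite
    print(re.match(r'[abc]', 'cat').group())     # [...] any one character from the set
    # matching multiple characters
    print(re.match(r'\d*', '2024year').group())  # *      zero or more repetitions
    print(re.match(r'\d+', '2024year').group())  # +      one or more repetitions
    print(re.match(r'\d?', '2024year').group())  # ?      zero or one repetition
    print(re.match(r'\d{2}', '2024').group())    # {m}    exactly m repetitions
    print(re.match(r'\d{2,4}', '2024').group())  # {m,n}  between m and n repetitions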

    Common small regex examples

    import re
    # 1. Match a mobile phone number
    text = '13691612426'
    ret = re.match(r'1[34578]\d{9}', text)
    print(ret.group())
    # 2. Match an email address
    text = '1871759153@qq.com'
    ret = re.match(r'\w+@[a-z0-9]+\.[a-z]+', text)
    print(ret.group())
    # 3. Match a URL
    text = 'https://www.runoob.com/python3/python3-tutorial.html'
    ret = re.match(r'(http|https|ftp)://[^\s]+', text)
    print(ret.group())
    # 4. Validate an ID-card number
    text = '32042519121281241x'
    ret = re.match(r'\d{17}[\dxX]', text)
    print(ret.group())
    # 5. Match a number from 1 to 100
    text = '98'
    ret = re.match(r'[1-9]\d?$|100$', text)
    print(ret.group())
    

    group(): capture groups

    import re
    # group(): capture groups
    text = 'apple price is $5, iphone price is $300'
    ret = re.match(r'.*(\$\d+).*(\$\d+)', text)
    print(ret.group(1))
    print(ret.group(2))
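    For reference, group() (or group(0)) returns the entire match, and groups() returns all captured groups as a tuple; continuing the example above:

    print(ret.group(0))   # the whole matched text
    print(ret.groups())   # ('$5', '$300')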
    

    Common functions in the re module

    import re
    # Common re functions
    # 1. findall(): find all matches and return them as a list
    text = 'apple price is $5, iphone price is $300'
    ret = re.findall(r'\d+', text)
    print(ret)
    # 2. sub(): find all matches and replace them
    text = 'apple price is $5, iphone price is $300'
    ret = re.sub(r'\d+', '0', text)
    print(ret)
    # 3. split(): split the string and return a list
    text = 'hello world ni hao'
    ret = re.split(' ', text)
    print(ret)
    # 4. compile(): pre-compiling a frequently used pattern improves efficiency
    text = 'the number is 20.50'
    r = re.compile(r"""
                    \d+   # digits before the decimal point
                    \.?   # the decimal point
                    \d+   # digits after the decimal point
                    """, re.VERBOSE)
    ret = re.search(r, text)
    print(ret.group())
    

    Worked example

    # Scraping gushiwen.cn (an application of regular expressions)
    import requests
    import re
    
    poems = []
    def page_parse(url):
        headers = {
            "user-agent":"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.61 Safari/537.36"
        }
        resp = requests.get(url,headers=headers)
        text = resp.text
        titles = re.findall(r'<div class="cont">.*?<b>(.*?)</b>', text, re.DOTALL)  # re.DOTALL lets . also match \n
        dynasties = re.findall(r'<p class="source">.*?<a.*?>(.*?)</a>',text,re.DOTALL)
        authors = re.findall(r'<p class="source">.*?<a.*?>.*?<a.*?>(.*?)</a>',text,re.DOTALL)
        contents_tags = re.findall(r'<div class="contson".*?>(.*?)</div>',text,re.DOTALL)
        contents = []
        for content in contents_tags:
            content = re.sub('<.*?>','',content)
            contents.append(content.strip())
        # zip() takes two or more sequences and pairs up their elements;
        # in Python 3 it returns an iterator of tuples, e.g.
        # x = [1, 2, 3]
        # y = [4, 5, 6]
        # print(list(zip(x, y)))  # -> [(1, 4), (2, 5), (3, 6)]
        for x in zip(titles,dynasties,authors,contents):
            title,dynasty,author,content = x
            poem = {
                "title":title,
                "dynasty":dynasty,
                "author":author,
                "content":content
            }
            poems.append(poem)
        # for poem in poems:
        #     print(poem)
    
    
    
    def main1():
        base_url = 'https://www.gushiwen.cn/default_{}.aspx'
        for x in range(1, 4):
            url = base_url.format(x)  # build the URL for each of the first few pages
            page_parse(url)
    
    if __name__ == '__main__':
        main1()
        with open('E:/桌面/poems.txt', 'w') as fp:
            for poem in poems:
                fp.write("\n" + str(poem))
    
