美文网首页Python爬虫作业
交作业 爬美股吧

交作业 爬美股吧

作者: Snow__ | 来源:发表于2017-05-29 14:59 被阅读0次

    作业要求:
    东方财富网美股吧帖子数据,包含:浏览数、评论数、帖子标题、帖子内容、回复人、回复时间、回复内容。网址:http://guba.eastmoney.com/list,meigu.html

    这个网站作为练习提升挺大的,有很多小细节要抠,花了好多时间,还是没搞完。目前能爬,但是有一些编码和最后数据处理的问题还没解决。不知为何。先交。

    # -*- coding:utf-8 -*-
    import requests
    from lxml import etree
    import csv
    
    import sys
    
    # Python 2-only workaround: `sys` is reloaded and the interpreter-wide
    # default encoding is then set to UTF-8, so implicit str <-> unicode
    # conversions later in the script use UTF-8.  The two statements must stay
    # in this order.
    # NOTE(review): `reload` is not a builtin on Python 3 and
    # `sys.setdefaultencoding` does not exist there, so this script as written
    # runs on Python 2 only.
    reload(sys)
    sys.setdefaultencoding('utf-8')
    
    # First page of the board's thread list.  In the visible code it is only
    # referenced by the commented-out get_total_page() helper.
    start_url = "http://guba.eastmoney.com/list,meigu_1.html"

    # Request headers sent with every page fetch.  The User-Agent is assembled
    # by implicit string-literal concatenation, so every piece except the last
    # must end with a space; otherwise the product tokens run together and the
    # header no longer looks like a real browser's.
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 6.1; Win64; x64) "
                      "AppleWebKit/537.36 (KHTML, like Gecko) "
                      "Chrome/58.0.3029.110 "
                      "Safari/537.36"
    }
    
    
    # def get_total_page(start_url):
    #    html = requests.get(url=start_url, headers=headers).content
    #    selector = etree.HTML(html)
    #    sum_page = selector.xpath("//span[@class='sumpage']/text()")
    #    return sum_page
    
    
    def parse_title(first_page=1, last_page=22, timeout=10):
        """Scrape the thread-list pages of the board and return one dict per thread.

        Args:
            first_page: Number of the first list page to fetch (inclusive).
            last_page: Number of the last list page to fetch (inclusive).  The
                defaults reproduce the original hard-coded pages 1..22.
            timeout: Seconds to wait for each HTTP response before requests
                raises, so a stalled connection cannot hang the crawl forever.

        Returns:
            A list of dicts with the keys 'title', 'author', 'read',
            'comment_num', 'post_time', 'last_update' and 'link'.  'author' and
            'link' are the raw XPath result lists (possibly empty); callers in
            this file index 'link' with [0], so that shape is kept.  The other
            values are single strings, '' when the span is missing.
        """

        def first_or_blank(nodes):
            # XPath always returns a list; a missing span must not abort the
            # whole crawl with an IndexError.
            return nodes[0] if nodes else ''

        rows = []
        for num in range(first_page, last_page + 1):
            url = "http://guba.eastmoney.com/list,meigu_" + str(num) + ".html"
            html = requests.get(url=url, headers=headers, timeout=timeout).content
            selector = etree.HTML(html)
            # Skip the first and last child <div> of the list container; only
            # the ones in between are treated as thread rows.
            items = selector.xpath("//div[@id='articlelistnew']/div[position()>1 and position()<last()]")
            for item in items:
                titles = item.xpath("span[@class='l3']/a/text()")
                if not titles:
                    # Presumably a non-thread row (ad / separator); there is
                    # nothing useful to record for it.
                    continue
                rows.append({
                    # lxml already returns decoded text, so no .decode() here.
                    'title': titles[0],
                    'author': item.xpath("span[@class='l4']/a/text()"),
                    'read': first_or_blank(item.xpath("span[@class='l1']/text()")),
                    'comment_num': first_or_blank(item.xpath("span[@class='l2']/text()")),
                    'post_time': first_or_blank(item.xpath("span[@class='l6']/text()")),
                    'last_update': first_or_blank(item.xpath("span[@class='l5']/text()")),
                    'link': item.xpath("span[@class='l3']/a/@href"),
                })
        return rows
    
    
    def _thread_url(link):
        """Turn an href taken from the list page into an absolute thread URL."""
        if link.startswith(('http://', 'https://')):
            # Already absolute; prefixing the site root would corrupt it.
            return link
        # Hrefs appear both with and without a leading slash; normalise so
        # exactly one slash separates host and path.
        return "http://guba.eastmoney.com/" + link.lstrip('/')


    def _parse_thread(url, timeout):
        """Fetch one thread page and return its content/comment row."""
        html = requests.get(url=url, headers=headers, timeout=timeout).content
        selector = etree.HTML(html)
        # Start from the "no comments" placeholders so a page without a comment
        # container still yields a row carrying the post content.
        row = {
            'content': selector.xpath("//div[@class='stockcodec']/text()"),
            'name': 'none',
            'comment': 'none',
            'time': 'none',
        }
        for container in selector.xpath("//div[@id='zwlist']"):
            if container.xpath("div[@class='zwli clearfix']"):
                # Each value is the list of all matches on the page, in
                # document order (one entry per matching node).
                row['name'] = container.xpath("div/div/div/div[@class='zwlianame']/span/a/text()")
                row['comment'] = container.xpath("div/div/div/div[@class='zwlitext stockcodec']/text()")
                row['time'] = container.xpath("div/div/div/div[@class='zwlitime']/text()")
        return row


    def parse_content_comment(titles=None, timeout=10):
        """Scrape post content and first-page replies for every listed thread.

        Args:
            titles: Rows in the shape returned by parse_title() (each with a
                'link' list).  When None, parse_title() is called to obtain
                them, which is the original behaviour.
            timeout: Seconds to wait for each HTTP response before requests
                raises.

        Returns:
            A list with one dict per thread that has a link, with the keys
            'content', 'name', 'comment' and 'time'.  'content' is a list of
            text nodes; the other three are lists of text nodes, or the string
            'none' when the page has no reply entries.
        """
        if titles is None:
            titles = parse_title()
        rows = []
        for entry in titles:
            hrefs = entry.get('link')
            if not hrefs:
                # No href was scraped for this row, so there is no page to fetch.
                continue
            rows.append(_parse_thread(_thread_url(hrefs[0]), timeout))
        return rows
    
    
    def _csv_cell(value):
        """Flatten an XPath result list into a single readable string."""
        if isinstance(value, (list, tuple)):
            # Writing the list itself would put its Python repr
            # (brackets, quotes, escape sequences) into the CSV cell.
            return u' | '.join(piece.strip() for piece in value)
        return value


    def _write_csv(path, fieldnames, rows):
        """Write `rows` (a list of dicts) to `path` with a header line."""
        if sys.version_info[0] == 2:
            # Python 2's csv module expects a binary-mode file; text mode
            # yields blank lines between rows on Windows.
            handle = open(path, 'wb')
        else:
            # Python 3's csv module expects text mode with newline=''.
            handle = open(path, 'w', newline='', encoding='utf-8')
        with handle as f:
            writer = csv.DictWriter(f, fieldnames)
            writer.writeheader()
            writer.writerows(
                {key: _csv_cell(cell) for key, cell in row.items()}
                for row in rows
            )


    if __name__ == "__main__":
        # Column order of the thread-list CSV and of the thread-detail CSV.
        headlines1 = ['title', 'author', 'read', 'comment_num', 'post_time', 'last_update', 'link']
        headlines2 = ['content', 'name', 'comment', 'time']
        rows1 = parse_title()
        rows2 = parse_content_comment()
        _write_csv('eastmoney1.csv', headlines1, rows1)
        _write_csv('eastmoney2.csv', headlines2, rows2)
    

    思路很简单,代码也很好懂。后来突然发现评论还有分页的。先放着,把这里处理完先。
    结果

    image.png image.png

    有点丑陋。。。

    相关文章

      网友评论

        本文标题:交作业 爬美股吧

        本文链接:https://www.haomeiwen.com/subject/ktnkfxtx.html