百度知道爬取

作者: 懵懂_傻孩纸 | 来源:发表于2018-12-23 15:51 被阅读33次

    基本功能已经实现,代码待完善,可以满足基本爬取

    from urllib.parse import quote

    import requests
    from lxml import etree
    
    
    class ZhiDaoSpider(object):
        """Crawler for Baidu Zhidao (知道): pages through search results for a
        user-supplied keyword and prints the title and answer text of every
        question found."""

        def __init__(self):
            # BUGFIX: the header key was "User - Agent" (not a valid HTTP
            # header name, so no User-Agent was ever sent) and the UA value
            # contained stray spaces ("Mozilla / 5.0(Windows..."). Both fixed.
            self.headers = {
                "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) "
                              "AppleWebKit/537.36 (KHTML, like Gecko) "
                              "Chrome/72.0.3610.2 Safari/537.36"
            }
            # Keyword to search for (typically Chinese text).
            self.word = input("请输入需要搜索的内容:")
            self.page = 0
            # BUGFIX: percent-encode the keyword so non-ASCII search terms
            # form a valid URL; the page offset is appended in main().
            self.base_url = "https://zhidao.baidu.com/search?word=%s&pn=" % quote(self.word)

        # 列表页url解析
        def send_request(self, url):
            """GET a search-result (list) page and return the Response."""
            # timeout keeps the crawler from hanging forever on a dead host
            return requests.get(url=url, headers=self.headers, timeout=10)

        # 详情页url解析
        def send_request_detail(self, url_list):
            """GET a question-detail page and return the Response."""
            return requests.get(url=url_list, headers=self.headers, timeout=10)

        # 列表页解析 得出每个详情页url
        def pares_page(self, response):
            """Parse one result page; return the list of detail-page URLs."""
            html = etree.HTML(response.content)
            # xpath() already returns a plain list of href strings — no need
            # to copy it element by element as the original did.
            return html.xpath("//div[@id='wgt-list']/dl/dt/a/@href")

        # 详情页解析  得出每个详情的内容
        def pares_details(self, details):
            """Parse a detail page: print the question title(s) and answer text."""
            html_obj = etree.HTML(details.content)
            # Question title(s)
            for title in html_obj.xpath("//div[@id='wgt-ask']/h1/span[1]/text()"):
                print(title)
            # Answer body text (list of text fragments)
            print(html_obj.xpath("//div[@accuse='aContent']/text()"))

        def main(self):
            """Crawl result pages at offsets 0, 10, ..., 740 and every detail
            page linked from each (same range as the original while-loop)."""
            for page in range(0, 750, 10):
                full_url = self.base_url + str(page)
                print(full_url)

                response = self.send_request(full_url)
                link_list = self.pares_page(response)
                print(link_list)

                # Visit each detail page found on this result page.
                for url in link_list:
                    print(url)
                    details = self.send_request_detail(url)
                    self.pares_details(details)
    
    
    # Script entry point: build the spider (prompts for a keyword) and crawl.
    if __name__ == '__main__':
        ZhiDaoSpider().main()
    
    
    
    

    相关文章

      网友评论

        本文标题:百度知道爬取

        本文链接:https://www.haomeiwen.com/subject/resnkqtx.html