美文网首页
Python requests库 ————百度新闻

Python requests库 ————百度新闻

作者: chliar | 来源:发表于2018-03-19 00:06 被阅读0次
    # -*- coding:utf-8 -*-
    from email.mime.text import MIMEText
    import requests,pymysql,threading,logging
    import smtplib
    import time,datetime
    from lxml import etree
    import re
    
    
    class baidu_new(object):
        """Crawl Baidu News search results, store unseen ones in MySQL and e-mail them.

        For every keyword in ``self.input`` a thread fetches the search page,
        extracts title / source / time / link for each result, prints the text
        of each linked article, inserts titles that are not yet in the ``new``
        table and mails the newly inserted entries.
        """

        # Seconds to wait for any HTTP response before giving up, so a stalled
        # server cannot block a crawler thread forever.
        REQUEST_TIMEOUT = 10

        # Translation table that deletes line breaks and tabs from article text.
        _BREAKS = str.maketrans('', '', '\n\t\r')

        def __init__(self):
            """Set the request headers and the list of search keywords."""
            self.headers = {
                'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/62.0.3202.94 Safari/537.36',
            }
            # Fixed search keywords. To prompt for them instead, read a
            # comma-separated line with input() and split it into a list.
            self.input = ['杨幂']

        def thread(self):
            """Start one crawler thread per keyword.

            Returns:
                The list of started ``threading.Thread`` objects, so a caller
                may ``join`` them; callers that ignore the result are unaffected.
            """
            threads = []
            for data_url in self.input:
                url = 'http://news.baidu.com/ns?word={}&from=news&cl=2&rn=20&ct=1'.format(data_url)
                t = threading.Thread(target=self.get_url, args=(url, data_url))
                t.start()
                threads.append(t)
            return threads

        @staticmethod
        def _split_source_time(texts):
            """Split the summary text nodes into ``(source, time)``.

            The two parts are separated by two non-breaking spaces; when the
            separator is missing the time is returned as an empty string.
            """
            source, _, published = ''.join(texts).partition('\xa0\xa0')
            return source.strip(), published.strip()

        @staticmethod
        def _detect_charset(raw):
            """Guess the page encoding from the raw response bytes.

            Looks for ``gb2312``, then ``gbk``, then ``utf-8`` (case-insensitive)
            anywhere in ``raw`` and falls back to ``'utf-8'`` when none occurs.
            """
            lowered = raw.lower()
            for name in (b'gb2312', b'gbk', b'utf-8'):
                if name in lowered:
                    return name.decode('ascii')
            return 'utf-8'

        @classmethod
        def _join_paragraphs(cls, nodes):
            """Concatenate text nodes after removing newlines, tabs and carriage returns."""
            return ''.join(node.translate(cls._BREAKS) for node in nodes)

        @staticmethod
        def _format_mail_body(new_data):
            """Render one numbered line per news item for the e-mail body."""
            return ''.join(
                '%s、' % num + str(new['title']) + '  来源于:' + str(new['source'])
                + '  发布时间:' + str(new['time']) + '\n'
                for num, new in enumerate(new_data, 1)
            )

        def get_url(self, url, data_url):
            """Fetch one search-result page and process every result on it.

            Args:
                url: Search-result URL to download.
                data_url: The keyword the URL was built from; used as the
                    e-mail subject keyword.

            Each result with a link is turned into a dict with the keys
            ``title``, ``source``, ``time`` and ``url``; its article is fetched
            via ``get_url_two``. The collected items are passed to ``mysql`` and
            the ones reported as new are passed to ``smtp``.
            """
            try:
                response = requests.get(url, headers=self.headers, timeout=self.REQUEST_TIMEOUT)
            except requests.RequestException as e:
                print(e)
                return
            html = response.content.decode('utf-8', 'ignore')
            if not html.strip():
                return
            tree = etree.HTML(html)
            if tree is None:
                return

            data_list = []
            for node in tree.xpath('//div[@class="result"]'):
                links = node.xpath('./h3[@class="c-title"]/a/@href')
                if not links:
                    # A result without a link cannot be fetched or reported.
                    continue
                link = links[0]
                # "//text()" also collects text nested in child tags, so a
                # keyword wrapped in markup is kept wherever it sits in the title.
                title = ''.join(node.xpath('./h3[@class="c-title"]/a//text()')).strip()
                summary = node.xpath('./div[@class="c-summary c-row c-gap-top-small"]/div[@class="c-span18 c-span-last"]/p/text()|./div[@class="c-summary c-row "]/p/text()')
                source, published = self._split_source_time(summary)
                item = {'title': title, 'source': source, 'time': published, 'url': link}
                try:
                    self.get_url_two(link)
                except requests.RequestException as e:
                    # One unreachable article must not abort the whole batch.
                    print(e)
                data_list.append(item)

            if data_list:
                new_data = self.mysql(data_list)
                print(new_data)
                if new_data:
                    self.smtp(new_data, data_url)

        def get_url_two(self, url):
            """Download an article page and print the text of its ``h1`` and ``p`` tags.

            Args:
                url: Article URL.

            Returns:
                The extracted text with line breaks and tabs removed (an empty
                string when the page has no parsable content).

            Raises:
                requests.RequestException: If the download fails.
            """
            response = requests.get(url, headers=self.headers, timeout=self.REQUEST_TIMEOUT)
            raw = response.content
            html = raw.decode(self._detect_charset(raw), 'ignore')
            if not html.strip():
                return ''
            tree = etree.HTML(html)
            if tree is None:
                return ''
            text = self._join_paragraphs(tree.xpath('//*/h1/text()|//*/p/text()'))
            print(text)
            return text

        def mysql(self, data_list):
            """Insert items whose title is not yet stored and return those items.

            The connection settings (host, account, password) are hard-coded and
            must be adjusted for the target database.

            Args:
                data_list: Dicts with at least ``title``, ``source`` and ``time``.

            Returns:
                The sub-list of ``data_list`` that was inserted; an empty list
                when the database connection cannot be opened.
            """
            new_data = []
            try:
                conn = pymysql.connect(host='127.0.0.1', port=3306, database='baidu_new', user='root', password='mysql', charset='utf8')
            except pymysql.MySQLError as e:
                print(e)
                return new_data

            # Placeholders let the driver quote scraped text safely.
            select_sql = "select 1 from new where title = %s limit 1"
            insert_sql = "insert into new(title,source,time) values(%s,%s,%s)"
            try:
                with conn.cursor() as cs1:
                    for item in data_list:
                        try:
                            count = cs1.execute(select_sql, (item['title'],))
                        except pymysql.err.ProgrammingError:
                            # Most likely the table does not exist yet: create it and retry once.
                            self.createmysql()
                            count = cs1.execute(select_sql, (item['title'],))
                        if count > 0:
                            continue
                        new_data.append(item)
                        print(item['title'])
                        cs1.execute(insert_sql, (item['title'], item['source'], item['time']))
                conn.commit()
            finally:
                conn.close()
            return new_data

        def smtp(self, new_data, keyword=None):
            """E-mail the new items; the recipient address is hard-coded below.

            Args:
                new_data: Dicts with ``title``, ``source`` and ``time``.
                keyword: Keyword shown in the subject; defaults to the first
                    entry of ``self.input``.
            """
            # NOTE(review): the account and its authorization code are hard-coded
            # in source; consider moving them to configuration before sharing.
            msg_from = '2387765890@qq.com'  # sender address
            passwd = 'jjqvnbtcedbzdhhi'  # sender's authorization code
            msg_to = '847734623@qq.com'  # recipient address

            subject = "%s的最新新闻" % (self.input[0] if keyword is None else keyword)

            msg = MIMEText(self._format_mail_body(new_data))
            msg['Subject'] = subject
            msg['From'] = msg_from
            msg['To'] = msg_to
            try:
                # The context manager closes the session even if login/send fails,
                # and nothing is closed when the connection was never opened.
                with smtplib.SMTP_SSL("smtp.qq.com", 465) as s:
                    s.login(msg_from, passwd)
                    s.sendmail(msg_from, msg_to, msg.as_string())
                print("发送成功")
            except (smtplib.SMTPException, OSError) as e:
                print("发送失败")
                print(e)

        def createmysql(self):
            """Create the ``new`` table if it does not exist yet.

            Errors are printed rather than raised.
            """
            try:
                conn = pymysql.connect(host='127.0.0.1', port=3306, database='baidu_new', user='root', password='mysql',
                                       charset='utf8')
            except pymysql.MySQLError as e:
                print(e)
                return
            try:
                with conn.cursor() as cs1:
                    cs1.execute("""
                            create table if not exists new(
                            title varchar(1000),
                            source varchar(40),
                            time varchar(100)
                            );
                            """)
                conn.commit()
            except pymysql.MySQLError as e:
                print(e)
            finally:
                conn.close()
    
    if __name__ == '__main__':
        # A fixed-time variant would compare now.hour / now.minute with the
        # wanted schedule (e.g. 11:40) and re-check every 60 seconds instead.
        def main(h=1, m=0):
            """Run a crawl round repeatedly, pausing 300 seconds between rounds.

            ``h`` and ``m`` are accepted but not used by the loop.
            """
            while True:
                now = datetime.datetime.now()
                print(now.hour, now.minute)
                bn = baidu_new()
                bn.thread()
                time.sleep(300)  # wait 300 seconds before the next round

        main()

        # For a single manual run, replace the call above with:
        #     bn = baidu_new()
        #     bn.thread()

    相关文章

      网友评论

          本文标题:Python requests库 ————百度新闻

          本文链接:https://www.haomeiwen.com/subject/kvcufftx.html