Python requests库 ————百度新闻

作者: chliar | 来源:发表于2018-03-19 00:06 被阅读0次

Python requests库 ————百度新闻
python爬虫脚本下载视频，同时借助FFmpeg合并视频
2019-01-01
python requests 库教程
[Python]从Web解析到网络空间（一些第三方库的简要介绍）
2019-01-09 python 库之 requests
2020-05-25 学习python爬虫系列（二）：Reque
Python Requests库用法
Python爬虫之爬虫利器集合
python接口测试

# -*- coding:utf-8 -*-
from email.mime.text import MIMEText
import requests,pymysql,threading,logging
import smtplib
import time,datetime
from lxml import etree
import re


class baidu_new(object):
    """Crawl Baidu News search results for a list of keywords.

    For every keyword a worker thread downloads the search result page,
    extracts title / source / time / link of each hit, fetches and prints the
    text of the linked article, stores titles not yet present in MySQL and
    e-mails the newly stored entries.
    """

    # Seconds to wait for any single HTTP request before giving up, so a
    # stalled server cannot block a worker thread forever.
    REQUEST_TIMEOUT = 10

    def __init__(self, keywords=None):
        """Set up request headers and the keyword list.

        Args:
            keywords: optional iterable of search keywords, or one string with
                keywords separated by ``,`` or ``，``.  When omitted the
                fixed keyword list ``['杨幂']`` is used.
        """
        self.headers = {
            'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/62.0.3202.94 Safari/537.36',
        }
        if isinstance(keywords, str):
            # Accept full-width commas as separators as well.
            keywords = keywords.replace('，', ',').split(',')
        self.input = list(keywords) if keywords else ['杨幂']

    def thread(self):
        """Start one crawler thread per keyword.

        Returns:
            The list of started ``threading.Thread`` objects (not joined), so
            a caller may wait for them if it wants to.
        """
        workers = []
        for keyword in self.input:
            url = 'http://news.baidu.com/ns?word={}&from=news&cl=2&rn=20&ct=1'.format(keyword)
            worker = threading.Thread(target=self.get_url, args=(url, keyword))
            worker.start()
            workers.append(worker)
        return workers

    @staticmethod
    def _join_title(parts, keyword):
        """Rebuild a headline from the text fragments of its link.

        The fragments are glued together with the search keyword, which is
        what the crawler has always done (presumably the keyword itself sits
        in a highlight child element and is therefore missing from the
        fragments -- verify against the live markup).
        """
        return keyword.join(parts).strip()

    @staticmethod
    def _split_source_time(text_nodes):
        """Split the author line into ``(source, time)``.

        The two fields are separated by two non-breaking spaces.  A missing
        separator yields an empty time instead of raising ``IndexError``.
        """
        pieces = ''.join(text_nodes).split('\xa0\xa0')
        source = pieces[0].strip()
        published = pieces[1].strip() if len(pieces) > 1 else ''
        return source, published

    @staticmethod
    def _detect_charset(raw):
        """Guess the encoding of a page from its raw bytes.

        Looks for the tokens ``gb2312``, ``gbk`` and ``utf-8`` (in that order,
        case-insensitively) anywhere in the document and falls back to
        ``utf-8`` when none of them occurs.
        """
        lowered = raw.lower()
        for token in (b'gb2312', b'gbk', b'utf-8'):
            if token in lowered:
                return token.decode('ascii')
        return 'utf-8'

    @staticmethod
    def _format_mail_body(new_data):
        """Render the news items as a numbered plain-text list."""
        return ''.join(
            '{}、{}  来源于：{}  发布时间：{}\n'.format(
                num, new['title'], new['source'], new['time'])
            for num, new in enumerate(new_data, 1)
        )

    @staticmethod
    def _connect():
        """Open the MySQL connection used by the crawler.

        Adjust host, account and password before testing.
        """
        return pymysql.connect(host='127.0.0.1', port=3306, database='baidu_new',
                               user='root', password='mysql', charset='utf8')

    def get_url(self, url, data_url):
        """Download one search result page and process all hits on it.

        Args:
            url: search result page to download.
            data_url: the keyword that was searched for.
        """
        try:
            response = requests.get(url, headers=self.headers,
                                    timeout=self.REQUEST_TIMEOUT)
        except requests.RequestException as e:
            print(e)
            return
        tree = etree.HTML(response.content.decode('utf-8', 'ignore'))
        if tree is None:
            # Defensive: nothing parseable came back.
            return

        data_list = []
        for node in tree.xpath('//div[@class="result"]'):
            links = node.xpath('./h3[@class="c-title"]/a/@href')
            if not links:
                # A hit without a link cannot be followed or reported.
                continue
            title_parts = node.xpath('./h3[@class="c-title"]/a/text()')
            info_nodes = node.xpath('./div[@class="c-summary c-row c-gap-top-small"]/div[@class="c-span18 c-span-last"]/p/text()|./div[@class="c-summary c-row "]/p/text()')
            source, published = self._split_source_time(info_nodes)
            item = {
                'title': self._join_title(title_parts, data_url),
                'source': source,
                'time': published,
                'url': links[0],
            }
            try:
                self.get_url_two(item['url'])
            except Exception as e:
                # Printing the article text is best-effort; one broken
                # article page must not abort the whole keyword.
                print(e)
            data_list.append(item)

        if data_list:
            new_data = self.mysql(data_list)
            print(new_data)
            if new_data:
                self.smtp(new_data, data_url)

    def get_url_two(self, url):
        """Fetch an article page, print its text and return it.

        The text is the concatenation of all ``h1`` and ``p`` text nodes with
        newlines, tabs and carriage returns removed.
        """
        response = requests.get(url, headers=self.headers,
                                timeout=self.REQUEST_TIMEOUT)
        raw = response.content
        tree = etree.HTML(raw.decode(self._detect_charset(raw), 'ignore'))
        if tree is None:
            return ''
        fragments = tree.xpath('//*/h1/text()|//*/p/text()')
        text = ''.join(
            fragment.replace('\n', '').replace('\t', '').replace('\r', '')
            for fragment in fragments
        )
        print(text)
        return text

    def mysql(self, data_list):
        """Store unseen news items and return them.

        Args:
            data_list: list of item dicts with ``title``, ``source`` and
                ``time`` keys.

        Returns:
            The items whose title was not in table ``new`` yet (these are
            inserted).  An empty list when the database cannot be reached.
        """
        new_data = []
        try:
            conn = self._connect()
        except pymysql.MySQLError as e:
            print(e)
            return new_data

        select_sql = "select * from new where title = %s"
        insert_sql = "insert into new(title,source,time) values(%s,%s,%s)"
        try:
            with conn.cursor() as cs1:
                for item in data_list:
                    # Scraped text is untrusted: let the driver escape it
                    # instead of formatting it into the statement.
                    try:
                        count = cs1.execute(select_sql, (item['title'],))
                    except pymysql.MySQLError:
                        # Most likely the table does not exist yet; create it
                        # and retry once.
                        self.createmysql()
                        count = cs1.execute(select_sql, (item['title'],))
                    if count > 0:
                        continue
                    new_data.append(item)
                    print(item['title'])
                    cs1.execute(insert_sql,
                                (item['title'], item['source'], item['time']))
            conn.commit()
        finally:
            conn.close()
        return new_data

    def smtp(self, new_data, keyword=None):
        """Mail the list of new news items.

        Args:
            new_data: item dicts to report.
            keyword: keyword named in the subject; defaults to the first
                configured keyword.
        """
        # NOTE(review): mail account and authorization code are hard-coded;
        # move them to configuration and rotate the code.
        msg_from = '2387765890@qq.com'  # sender address
        passwd = 'jjqvnbtcedbzdhhi'  # sender authorization code
        msg_to = '847734623@qq.com'  # recipient address (set when testing)

        if keyword is None:
            keyword = self.input[0]
        msg = MIMEText(self._format_mail_body(new_data))
        msg['Subject'] = "%s的最新新闻" % keyword
        msg['From'] = msg_from
        msg['To'] = msg_to
        try:
            # The context manager sends QUIT only if the connection was
            # actually established.
            with smtplib.SMTP_SSL("smtp.qq.com", 465) as s:
                s.login(msg_from, passwd)
                s.sendmail(msg_from, msg_to, msg.as_string())
            print("发送成功")
        except (smtplib.SMTPException, OSError) as e:
            print("发送失败", e)

    def createmysql(self):
        """Create table ``new`` if it does not exist; errors are printed."""
        try:
            conn = self._connect()
        except pymysql.MySQLError as e:
            print(e)
            return
        try:
            with conn.cursor() as cs1:
                cs1.execute("""
                    create table if not exists new(
                    title varchar(1000),
                    source varchar(40),
                    time varchar(100)
                    );
                    """)
            conn.commit()
        except pymysql.MySQLError as e:
            print(e)
        finally:
            conn.close()

def main(h=1, m=0, interval=300):
    """Crawl all configured keywords forever, pausing between rounds.

    An earlier variant of this loop only crawled at a fixed clock time
    (11:40), polling once per minute; this version crawls unconditionally on
    every round.

    Args:
        h: accepted for backward compatibility; currently unused.
        m: accepted for backward compatibility; currently unused.
        interval: seconds to sleep between two crawl rounds (default 300).
    """
    while True:
        now = datetime.datetime.now()
        print(now.hour, now.minute)
        bn = baidu_new()
        bn.thread()
        time.sleep(interval)


if __name__ == '__main__':
    # For a single manual test run, call baidu_new().thread() instead.
    main()