美文网首页
python sina

python sina

作者: laod_wh | 来源:发表于2019-09-25 18:08 被阅读0次
    import time
    from datetime import datetime
    import requests
    from selenium.common.exceptions import NoSuchElementException, ElementNotInteractableException, TimeoutException
    from selenium.webdriver.firefox.options import Options
    from selenium.webdriver.support import  expected_conditions as EC
    from selenium.webdriver.support.ui import WebDriverWait
    from selenium import  webdriver
    from selenium.webdriver.common.by import By
    import lxml.html
    etree=lxml.html.etree
    def tit():
        """Collect headline titles and article links from the Sina world-news page.

        Opens the page in Firefox, scrolls down until the "next page" marker is
        present in the DOM, then extracts the text and ``href`` of every
        headline anchor in the configured news containers.

        Returns:
            A ``(titles, links)`` tuple of two lists, in page-section order.
        """
        # Firefox is used here; a headless configuration would also work and
        # would be faster.
        respond = webdriver.Firefox()
        try:
            respond.get('https://news.sina.com.cn/world/')
            # Explicit wait of 3 seconds per attempt.
            wait = WebDriverWait(respond, 3)
            while True:
                try:
                    # Block until the pager element has been located.
                    wait.until(
                        EC.presence_of_element_located((By.XPATH, '//span[@class="pagebox_next"]'))
                    )
                    break
                except TimeoutException:
                    # On timeout, scroll to the bottom via JavaScript so that
                    # more content is loaded, then wait again.
                    js = 'var q=document.documentElement.scrollTop=100000'
                    respond.execute_script(js)
            # Snapshot the rendered page source.
            html = respond.page_source
        finally:
            # Always shut the browser down; otherwise every call (or any
            # exception above) leaves a Firefox process running.
            respond.quit()

        ff = etree.HTML(html)

        # (container id, exact class attribute) of each headline group.
        # NOTE(review): "subShowContent1_news3" is never queried while
        # "news4" is queried with two class filters -- confirm this is
        # intentional.
        sections = (
            ('subShowContent1_news1', 'news-item first-news-item img-news-item'),
            ('subShowContent1_news2', 'news-item  img-news-item'),
            ('subShowContent1_news4', 'news-item  img-news-item'),
            ('subShowContent1_news4', 'news-item'),
        )
        t = []
        s = []
        for container_id, item_class in sections:
            anchor = '//div[@id="{}"]/div[@class="{}"]/h2/a'.format(container_id, item_class)
            t.extend(ff.xpath(anchor + '/text()'))
            s.extend(ff.xpath(anchor + '/@href'))
        # All headline titles and their links.
        return t, s
    def xx(ss):
        """Fetch one article page and print its publication time.

        Args:
            ss: URL of the article page.

        Prints the parsed ``datetime`` of the first ``<span class="date">``
        element. If the page has no such element, a notice is printed and
        the page is skipped.
        """
        ee = requests.get(ss)
        # Was misspelled as 'utd-8', which is not a valid codec name.
        ee.encoding = 'utf-8'
        mm = etree.HTML(ee.text)
        dd = mm.xpath('//span[@class="date"]/text()')
        if not dd:
            # Pages without a date span would otherwise raise IndexError and
            # abort the whole crawl.
            print('no publication date found: %s' % ss)
            return
        # Convert the date text to a datetime; the local name avoids shadowing
        # the imported ``time`` module.
        published = datetime.strptime(dd[0], '%Y年%m月%d日 %H:%M')
        print(published)
    if __name__ == '__main__':
        # Script entry point: scrape the listing page, show what was found,
        # then visit every article link.
        titles, links = tit()
        print(titles)
        print(links)
        print(len(links))
        # Each link is passed to xx(); only the publication time is scraped
        # here, other fields could be added the same way.
        for _title, link in zip(titles, links):
            xx(link)
    

    相关文章

      网友评论

          本文标题:python sina

          本文链接:https://www.haomeiwen.com/subject/jkeiyctx.html