用python写一个cnBeta阅读器

作者: bigtrace | 来源:发表于2017-07-21 04:22 被阅读373次

我个人平时喜欢逛cnBeta和百度贴吧,于是利用之前写百度贴吧客户端的code,写了一个cnBeta的阅读器。

用python写一个百度贴吧客户端

由于cnBeta http://www.cnbeta.com/ 电脑端广告实在太多,要想阅读新闻和评论实在十分费时,于是我用Python抓取手机版 http://m.cnbeta.com/wap 的内容,方便大家阅读。

功能与界面与我之前的百度python客户端十分相似。

一打开便会显示首页的最新新闻,如果想看第2页的新闻则输入s 2, 以此类推。

s 2

阅读某一个新闻,则输入t index , 比如查看index 为1 的新闻

t 1

不用你自己亲自查看评论,程序会抓取评论并直接显示在文章下方。

输入 b 可以返回新闻列表。

由于我自己不喜欢评论,所以我没有添加评论该新闻的功能。想要加评论功能,也很简单,可以参考我的百度客户端的文章。

新增预览图片功能

输入pic, 可以打开一个用PyQt4库写的小窗口,用来预览该新闻内的图片,并且可以前后翻页。

以下附上code:

# coding=utf-8
import sys
import pycurl
import os
import time
from StringIO import StringIO
import re
import lxml.html
import unicodedata
from PyQt4.QtGui import *
from PyQt4 import QtGui
from colorama import Fore, Back, Style,init
from termcolor import colored


# class definition

class Example(QtGui.QWidget):
    """Minimal picture viewer: one image label plus Previous/Next buttons.

    Pictures are downloaded on demand with pycurl, one request per displayed
    image; nothing is cached.  Navigation wraps around at both ends of the
    list.
    """

    # Proxy settings used when the caller does not pass its own.  Pass
    # ``proxy=None`` (or an empty string) for a direct connection.
    DEFAULT_PROXY = 'http://192.168.87.15:8080'
    DEFAULT_PROXY_USERPWD = 'LL66269:'

    def __init__(self, all_pic_list, proxy=DEFAULT_PROXY,
                 proxy_userpwd=DEFAULT_PROXY_USERPWD):
        """Build the window and show the first picture.

        :param all_pic_list: list of image URLs to browse (may be empty).
        :param proxy: proxy URL handed to pycurl, or a false value for none.
        :param proxy_userpwd: ``user:password`` for the proxy (NTLM auth is
            requested when this is set), or a false value for none.
        """
        super(Example, self).__init__()
        self.url_list = all_pic_list
        self.current_pic_index = 0
        self.proxy = proxy
        self.proxy_userpwd = proxy_userpwd
        self.initUI()

    def initUI(self):
        """Create the label and buttons, load the first image, show the window."""
        QtGui.QToolTip.setFont(QtGui.QFont('Test', 10))
        self.setToolTip('This is a <b>QWidget</b> widget')

        # Image area; created first so the buttons below are drawn on top of it.
        self.pic = QtGui.QLabel(self)
        self.pic.setGeometry(0, 0, 600, 500)
        self._show_current()

        # Navigation buttons.
        btn_next = QtGui.QPushButton('Next', self)
        btn_next.setToolTip('This is a <b>QPushButton</b> widget')
        btn_next.resize(btn_next.sizeHint())
        btn_next.clicked.connect(self.fun_next)
        btn_next.move(300, 50)

        btn_prev = QtGui.QPushButton('Previous', self)
        btn_prev.setToolTip('This is a <b>QPushButton</b> widget')
        btn_prev.resize(btn_prev.sizeHint())
        btn_prev.clicked.connect(self.fun_prev)
        btn_prev.move(50, 50)

        self.setGeometry(300, 300, 500, 500)
        self.setWindowTitle('ImgViewer')
        self.show()

    def retrieve_from_url(self, pic_url):
        """Download ``pic_url`` and return the raw response body.

        Raises ``pycurl.error`` when the transfer fails; the curl handle is
        closed in every case.
        """
        handle = pycurl.Curl()
        payload = StringIO()
        try:
            if self.proxy:
                handle.setopt(pycurl.PROXY, self.proxy)
                if self.proxy_userpwd:
                    handle.setopt(pycurl.PROXYUSERPWD, self.proxy_userpwd)
                    handle.setopt(pycurl.PROXYAUTH, pycurl.HTTPAUTH_NTLM)
            handle.setopt(pycurl.URL, pic_url)
            handle.setopt(handle.WRITEDATA, payload)
            handle.perform()
        finally:
            # Release the handle even when perform() raises.
            handle.close()
        return payload.getvalue()

    def _show_current(self):
        """Download and display the picture at ``current_pic_index``."""
        if not self.url_list:
            # Nothing to show; avoids indexing into an empty list.
            self.pic.clear()
            return

        pic_url = self.url_list[self.current_pic_index]
        pixmap = QtGui.QPixmap()
        try:
            loaded = pixmap.loadFromData(self.retrieve_from_url(pic_url))
        except pycurl.error:
            loaded = False

        if loaded:
            self.pic.setPixmap(pixmap)
        else:
            # Download failed or the data is not a decodable image.
            self.pic.setText('Failed to load image: ' + pic_url)

    def fun_next(self):
        """Slot for the Next button: advance one picture, wrapping to the first."""
        if self.url_list:
            self.current_pic_index = (self.current_pic_index + 1) % len(self.url_list)
        self._show_current()

    def fun_prev(self):
        """Slot for the Previous button: go back one picture, wrapping to the last."""
        if self.url_list:
            self.current_pic_index = (self.current_pic_index - 1) % len(self.url_list)
        self._show_current()


def main(all_pic_list):
    """Run the picture viewer as a stand-alone Qt application.

    Creates the QApplication, opens an ``Example`` window for
    ``all_pic_list`` and exits the interpreter with the event loop's
    return code once the loop ends.
    """
    qt_app = QtGui.QApplication(sys.argv)
    # Bound to a name so the window object stays referenced while the event
    # loop below is running.
    viewer = Example(all_pic_list)
    exit_code = qt_app.exec_()
    sys.exit(exit_code)


#---------------------------------------------
class Browser_cnbeta(object):
    """Console reader for the mobile cnBeta site (http://m.cnbeta.com/wap).

    The instance keeps one pycurl handle plus the state of the last news
    list that was shown:

    * ``tiezi_link``              -- article URLs, by list index
    * ``shouye_titles``           -- article titles, by list index
    * ``display_shouye_list``     -- the encoded rows as they were printed
    * ``current_thread_pic_list`` -- image URLs of the last article read

    All text is written to the console encoded as ``CONSOLE_ENCODING``.
    """

    # Proxy settings used when the caller does not pass its own.  Pass
    # ``proxy=None`` (or an empty string) for a direct connection.
    DEFAULT_PROXY = 'http://192.168.87.15:8080'
    DEFAULT_PROXY_USERPWD = 'LL66269:'
    USER_AGENT = 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36'
    CONSOLE_ENCODING = "gb18030"

    # pycurl handle.  Created per instance in __init__ so that importing the
    # class has no side effect and exit() on one instance cannot close the
    # handle of another.
    c = None

    def __init__(self, proxy=DEFAULT_PROXY, proxy_userpwd=DEFAULT_PROXY_USERPWD):
        """Show the welcome banner, then load page 1 of the news list.

        :param proxy: proxy URL handed to pycurl, or a false value for none.
        :param proxy_userpwd: ``user:password`` for the proxy (NTLM auth is
            requested when this is set), or a false value for none.
        """
        self.c = pycurl.Curl()
        self._configure_curl(proxy, proxy_userpwd)

        self.header_max_width = 12
        self.title_max_width = 70
        self.tiezi_link = []
        self.shouye_titles = []
        self.display_shouye_list = []
        self.current_thread_pic_list = []

        os.system('cls')
        # Raw string: every backslash of the banner is printed literally,
        # including the one that ends the last art line (in a normal string
        # a backslash followed by a newline is a line continuation).
        print(r"""
        
               _                       _                     ____  _____ _____  _    
 __      _____| | ___ ___  _ __ ___   | |_ ___     ___ _ __ | __ )| ____|_   _|/ \   
 \ \ /\ / / _ \ |/ __/ _ \| '_ ` _ \  | __/ _ \   / __| '_ \|  _ \|  _|   | | / _ \  
  \ V  V /  __/ | (_| (_) | | | | | | | || (_) | | (__| | | | |_) | |___  | |/ ___ \ 
   \_/\_/ \___|_|\___\___/|_| |_| |_|  \__\___/   \___|_| |_|____/|_____| |_/_/   \_\
                                                                                     
made by bigtrace
http://www.jianshu.com/p/f04e514c2902
7/20/2017
        """)
        time.sleep(2)
        self.read_shouye(1)

    def _configure_curl(self, proxy, proxy_userpwd):
        """Apply the proxy and request options shared by every fetch."""
        if proxy:
            self.c.setopt(pycurl.PROXY, proxy)
            if proxy_userpwd:
                self.c.setopt(pycurl.PROXYUSERPWD, proxy_userpwd)
                self.c.setopt(pycurl.PROXYAUTH, pycurl.HTTPAUTH_NTLM)
        self.c.setopt(self.c.FOLLOWLOCATION, 1)
        self.c.setopt(pycurl.VERBOSE, 0)
        self.c.setopt(pycurl.FAILONERROR, True)
        self.c.setopt(pycurl.USERAGENT, self.USER_AGENT)

    def _fetch_document(self, url):
        """Download ``url`` and return it parsed as an lxml HTML tree.

        The body is decoded as UTF-8 with undecodable bytes dropped.
        Transfer and parse errors propagate to the caller.
        """
        payload = StringIO()
        self.c.setopt(pycurl.URL, url)
        self.c.setopt(self.c.WRITEDATA, payload)
        self.c.perform()
        body = payload.getvalue().decode('utf-8', 'ignore')
        return lxml.html.fromstring(body)

    def _console(self, text):
        """Return ``text`` as bytes in the console encoding.

        Byte strings are passed through untouched; unicode text is encoded
        the same way the news-list rows are.
        """
        if isinstance(text, bytes):
            return text
        return text.encode(self.CONSOLE_ENCODING)

    def wide_chars(self, s):
        """Return how many characters of ``s`` are East Asian wide/fullwidth.

        Each such character takes one extra console column, so the result is
        the extra width to subtract when padding.  Byte strings are decoded
        as UTF-8 first.
        """
        if isinstance(s, bytes):
            s = s.decode('utf-8')
        return sum(unicodedata.east_asian_width(x) in ('F', 'W') for x in s)

    def read_shouye(self, index):
        """Fetch page ``index`` of the news list, print it and remember it.

        Entries without a link are skipped, so the printed index always
        matches the position in ``tiezi_link`` / ``shouye_titles``.
        """
        os.system('cls')

        url_tbs = 'http://m.cnbeta.com/wap/index.htm?page=' + str(index)
        print(colored(url_tbs, 'blue'))
        print(colored("\n---------------------", 'green'))

        doc = self._fetch_document(url_tbs)

        # Example article link: http://m.cnbeta.com/wap/view/633687.htm
        titles = []
        links = []
        rows = []

        self.header_max_width = 12
        self.title_max_width = 70
        for each_news in doc.xpath("//div[@class='list']"):
            anchors = each_news.xpath(".//a")
            hrefs = each_news.xpath(".//a/@href")
            if not anchors or not hrefs:
                continue

            position = len(links)
            title = anchors[0].text_content()
            links.append("http://m.cnbeta.com" + hrefs[0])
            titles.append(title)

            # Pad by the visible text only: the colour escape sequences
            # occupy no console columns.
            visible_header = "index " + str(position)
            header = "index " + colored(str(position), 'yellow')
            header += " " * max(0, self.header_max_width - len(visible_header))

            # Wide characters take two columns each, so they count twice.
            each_title = u":   " + title
            fill = self.title_max_width - self.wide_chars(each_title) - len(each_title)
            padded_title = each_title + u" " * max(0, fill)

            try:
                each_display = (header + padded_title).encode(self.CONSOLE_ENCODING)
            except UnicodeError:
                each_display = (header + "Title can't be displayed").encode(self.CONSOLE_ENCODING)

            print(each_display)
            print("")
            rows.append(each_display)

        self.tiezi_link = links
        self.shouye_titles = titles
        self.display_shouye_list = rows

        print(colored("\n---------------------", 'green'))

    def read_each_news(self, index):
        """Print article number ``index`` of the current list, then its comments.

        Raises ``IndexError``/``ValueError`` for a bad index before the
        screen is cleared.  The image URLs found in the article are stored
        in ``current_thread_pic_list``.
        """
        position = int(index)
        link = self.tiezi_link[position]
        title = self.shouye_titles[position]

        os.system('cls')
        print("===================================================\n\n\n")
        print(self._console(colored(title, 'magenta') + colored("  <" + link + "> \n", 'blue')))

        doc = self._fetch_document(link)

        time_subtitle = u"".join(
            each_span.text_content()
            for each_span in doc.xpath("//div[@class='time']/span"))

        print("")
        print(self._console(colored(time_subtitle, 'cyan')))
        print("")

        self.current_thread_pic_list = []
        for each_paragraph in doc.xpath("//div[@class='content']/p"):
            print("")
            # Replace non-breaking spaces with plain spaces.
            text_content = each_paragraph.text_content().replace(u'\xa0', u' ')
            print(self._console(text_content))
            for each_img in each_paragraph.xpath(".//img/@src"):
                print(self._console(colored("<img url: " + each_img + ">", 'yellow')))
                self.current_thread_pic_list.append(each_img)

        blockquotes = doc.xpath("//div[@class='content']/blockquote")
        for j, each_blockquote in enumerate(blockquotes, 1):
            print("blockquote <" + str(j) + "> ~~~~~~~~~~~\n")
            print(self._console(each_blockquote.text_content()))
            print("~~~~~~~~~~~~~~~~~~~~~~~~~~\n")

        self.view_comment(link)

    def Get_Back_To_shouye(self):
        """Clear the screen and reprint the rows of the last news list."""
        os.system('cls')
        for each_display in self.display_shouye_list:
            print(each_display)

    def exit(self):
        """Close the curl handle and show the goodbye banner for one second."""
        if self.c is not None:
            self.c.close()
        os.system('cls')
        print(r"""
 _                    _                 
| |                  | |                
| |__  _   _ _____   | |__  _   _ _____ 
|  _ \| | | | ___ |  |  _ \| | | | ___ |
| |_) ) |_| | ____|  | |_) ) |_| | ____|
|____/ \__  |_____)  |____/ \__  |_____)
      (____/               (____/       

""")
        time.sleep(1)
        os.system('cls')

    def view_comment(self, url):
        """Print the comments of the article whose page is ``url``.

        The article id is the first run of digits in ``url``; a
        ``ValueError`` is raised when there is none.
        NOTE: only the first comment page (``?page=1``) is requested.
        """
        # Example comment page: http://m.cnbeta.com/wap/comment/633621.htm
        id_match = re.search(r"(\d+)", url)
        if id_match is None:
            raise ValueError("no article id found in url: %r" % (url,))
        comment_url = "http://m.cnbeta.com/wap/comment/" + str(id_match.group(1)) + ".htm?page="

        doc = self._fetch_document(comment_url + "1")

        # An absent content block is shown as an empty comment section.
        content_nodes = doc.xpath("//div[@class='content']")
        comment_all = content_nodes[0].text_content() if content_nodes else u""

        print(colored("\n--------------- comment ---------------", 'green'))
        print(self._console(comment_all))
        print(colored("--------------- finished ---------------", 'green'))

    def view_image(self):
        """Open the picture viewer for the last article and wait until it closes.

        Returns normally afterwards so the caller can go on reading.
        """
        pictures = getattr(self, 'current_thread_pic_list', None)
        if not pictures:
            print("no picture to preview")
            return

        print("launch picture viewer...")
        # Reuse an existing QApplication: Qt allows only one per process.
        viewer_app = QtGui.QApplication.instance()
        if viewer_app is None:
            viewer_app = QtGui.QApplication(sys.argv)
        # Bound to a name so the window stays referenced while the event
        # loop runs.
        ex = Example(pictures)
        viewer_app.exec_()


# Interactive command loop.
#   s <page>    show page <page> of the news list
#   t <index>   open the news item with that index
#   b           reprint the last news list
#   c           clear the console
#   pic         preview the pictures of the current article
#   e           quit
app = Browser_cnbeta()

while True:
    print("""



    """)
    try:
        nb = raw_input('Give me your command: \n')
    except (EOFError, KeyboardInterrupt):
        # stdin was closed or Ctrl-C was pressed: leave the loop instead of
        # prompting again forever.
        break

    try:
        if nb.startswith('s '):
            page = re.match(r"s\s+(\d+)", nb)
            if page:
                app.read_shouye(page.group(1))
            else:
                print("type correct command")
        elif nb.startswith('t '):
            item = re.match(r"t\s+(\d+)", nb)
            if item:
                app.read_each_news(item.group(1))
            else:
                print("type correct command")
        elif nb == "b":
            app.Get_Back_To_shouye()
        elif nb == "c":
            os.system('cls')  # on windows
        elif nb == "e":
            break
        elif nb == "pic":
            try:
                app.view_image()
            except SystemExit:
                # The viewer may finish by calling sys.exit(); closing the
                # picture window should not quit the reader.
                pass
        else:
            print("type correct command")
    except Exception as exc:
        # Report the failure (bad index, network error, ...) and keep going.
        print("command failed: " + repr(exc))


app.exit()

相关文章

网友评论

    本文标题:用python写一个cnBeta阅读器

    本文链接:https://www.haomeiwen.com/subject/kgdskxtx.html