美文网首页
Python text

Python text

作者: laod_wh | 来源:发表于2019-11-27 17:24 被阅读0次

最近突然想看电子书,可想看的那本书必须下载 App 或者在线阅读,太麻烦,于是写了个简单的抓取脚本:

#!/usr/bin/python
# -*- coding: utf-8 -*-
# http://scikit-learn.org/stable/modules/feature_extraction.html#tfidf-term-weighting

import http.cookiejar  # import cookielib
import os
import io
import random
import urllib
import uuid
import bs4
import datetime
import requests
import sys
import time
import re
from config_info import *
from bs4 import BeautifulSoup as BS
import pymysql
import lxml.html
import re
import gzip
etree = lxml.html.etree

# Browser User-Agent strings; getSoup picks one at random for each request
agents = [
    # Firefox
    "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:34.0) Gecko/20100101 Firefox/34.0",
    "Mozilla/5.0 (X11; U; Linux x86_64; zh-CN; rv:1.9.2.10) Gecko/20100922 Ubuntu/10.10 (maverick) Firefox/3.6.10",
    # Chrome
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.71 Safari/537.36",
    "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.11 (KHTML, like Gecko) Chrome/23.0.1271.64 Safari/537.11",
    "Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) AppleWebKit/534.16 (KHTML, like Gecko) Chrome/10.0.648.133 Safari/534.16",
    # UC Browser
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/38.0.2125.122 UBrowser/4.0.3214.0 Safari/537.36",
    # iPhone
    "Mozilla/5.0 (iPhone; U; CPU iPhone OS 4_3_3 like Mac OS X; en-us) AppleWebKit/533.17.9 (KHTML, like Gecko) Version/5.0.2 Mobile/8J2 Safari/6533.18.5",
    # iPod
    "Mozilla/5.0 (iPod; U; CPU iPhone OS 4_3_3 like Mac OS X; en-us) AppleWebKit/533.17.9 (KHTML, like Gecko) Version/5.0.2 Mobile/8J2 Safari/6533.18.5",
    # iPad
    "Mozilla/5.0 (iPad; U; CPU OS 4_2_1 like Mac OS X; zh-cn) AppleWebKit/533.17.9 (KHTML, like Gecko) Version/5.0.2 Mobile/8C148 Safari/6533.18.5",
    "Mozilla/5.0 (iPad; U; CPU OS 4_3_3 like Mac OS X; en-us) AppleWebKit/533.17.9 (KHTML, like Gecko) Version/5.0.2 Mobile/8J2 Safari/6533.18.5",
    # Android
    "Mozilla/5.0 (Linux; U; Android 2.2.1; zh-cn; HTC_Wildfire_A3333 Build/FRG83D) AppleWebKit/533.1 (KHTML, like Gecko) Version/4.0 Mobile Safari/533.1",
    "Mozilla/5.0 (Linux; U; Android 2.3.7; en-us; Nexus One Build/FRF91) AppleWebKit/533.1 (KHTML, like Gecko) Version/4.0 Mobile Safari/533.1"
]

# Proxy mapping meant to keep the scraper from being blocked.
# NOTE(review): left empty and not referenced by the code visible in this file.
proxies = {}

# Fetch an HTML page and parse it into a BeautifulSoup tree
def getSoup(page, timeout=30):
    """Download *page* and return it parsed as a BeautifulSoup document.

    The request carries a randomly chosen User-Agent plus fixed Host and
    Referer headers. After the download the function sleeps for a random
    2-5 seconds so that consecutive calls are throttled.

    Args:
        page: URL of the page to fetch.
        timeout: Socket timeout in seconds passed to ``urlopen``; prevents a
            stalled connection from blocking the caller forever.

    Returns:
        The page parsed with BeautifulSoup's ``html.parser``.

    Raises:
        urllib.error.URLError: If the request fails (``HTTPError`` for
            non-success HTTP statuses).
    """
    # A bare ``import urllib`` does not guarantee the ``request`` submodule is
    # loaded, so import it explicitly where it is used.
    import urllib.request

    headers = {
        # Pick a random User-Agent for every request
        "User-Agent": random.choice(agents),
        "Host": "www.xuehong.cc",
        "Referer": "https://www.xuehong.cc/book/9133/"
    }

    # The headers must be attached to the Request *before* it is sent;
    # setting them on the response object afterwards has no effect.
    request = urllib.request.Request(page, headers=headers)
    with urllib.request.urlopen(request, timeout=timeout) as response:
        # Undecodable bytes are dropped rather than aborting the whole page
        html = response.read().decode('utf-8', errors='ignore')

    soup = BS(html, 'html.parser')

    # Be polite to the server: pause 2-5 seconds between requests
    time.sleep(random.randint(2, 5))
    return soup

def main(start_id=31445761, stop_id=36622777,
         out_path="/Volumes/LaCie/mac/python/book.txt"):
    """Scrape consecutive chapter pages of book 9133 and append them to a file.

    For every id in ``range(start_id, stop_id)`` the page
    ``https://www.xuehong.cc/book/9133/<id>.html`` is fetched with
    ``getSoup``; the chapter title (``div.bookname > h1``) and the chapter
    body (``div#content``) are appended to *out_path*, each followed by a
    newline.

    Any exception stops the run; it is reported on stdout instead of being
    propagated.

    Args:
        start_id: First page id to fetch (inclusive).
        stop_id: Page id at which to stop (exclusive).
        out_path: Text file the chapters are appended to (UTF-8).
    """
    try:
        print('------------start---------------')
        for page_id in range(start_id, stop_id):
            url_ = "https://www.xuehong.cc/book/9133/" + str(page_id) + ".html"
            print("current page:" + " ---> " + url_)

            page_soup = getSoup(url_)  # download and parse the page

            content = page_soup.find('div', id='content')
            bookname = page_soup.find('div', class_='bookname')
            heading = bookname.find('h1') if bookname is not None else None
            if content is None or heading is None:
                # Fail with a message naming the page instead of an opaque
                # AttributeError on None.
                raise ValueError("chapter markup not found on " + url_)

            title = heading.text.strip()
            body = content.text.strip()

            # Open in append mode only once there is something to write; the
            # context manager closes the file even if a write fails.
            with open(out_path, "a", encoding="utf-8") as txt_file:
                txt_file.write(title)
                txt_file.write("\n")
                txt_file.write(body)
                txt_file.write("\n")
        print('------------success end---------------')
    except Exception as e:
        # Top-level boundary of the script: report the failure and stop
        print('------------Exception !!!---------------')
        print(e)

# Start the scraper only when this file is run as a script, not when imported
if __name__ == "__main__":
    main()

运行日志:

pydev debugger: process 12236 is connecting

Connected to pydev debugger (build 171.4694.70)
------------start---------------
current page: ---> https://www.xuehong.cc/book/9133/31445761.html
current page: ---> https://www.xuehong.cc/book/9133/31445762.html
current page: ---> https://www.xuehong.cc/book/9133/31445763.html
current page: ---> https://www.xuehong.cc/book/9133/31445764.html
current page: ---> https://www.xuehong.cc/book/9133/31445765.html
current page: ---> https://www.xuehong.cc/book/9133/31445766.html
current page: ---> https://www.xuehong.cc/book/9133/31445767.html
current page: ---> https://www.xuehong.cc/book/9133/31445768.html
current page: ---> https://www.xuehong.cc/book/9133/31445769.html

运行文件:


image.png

相关文章

网友评论

      本文标题:Python text

      本文链接:https://www.haomeiwen.com/subject/pwxmwctx.html