
Getting Started with Python Web Scraping

Author: Yuu_CX | Published 2017-02-28 10:46

    Fetching an image and saving it to a folder

    import urllib.request

    # Download the image bytes and write them to a local file
    response = urllib.request.urlopen('http://placekitten.com/1920/1280')
    cat_img = response.read()
    with open('cat_1920_1280.jpg', 'wb') as f:
        f.write(cat_img)
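
    A slightly more defensive variant, sketched below, adds a timeout and only writes the file when the server answers 200; the URL and filename are the ones from the example above.

    import urllib.request

    # Hedged sketch: the same download, with a timeout and a status check
    response = urllib.request.urlopen('http://placekitten.com/1920/1280', timeout=10)
    if response.status == 200:  # only save the body on success
        with open('cat_1920_1280.jpg', 'wb') as f:
            f.write(response.read())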
    

    Using Youdao Translate

    # -*- coding: utf-8 -*-
    import urllib.request
    import urllib.parse
    import json

    content = input("Enter the text to translate: ")
    url = 'http://fanyi.youdao.com/translate?smartresult=dict&smartresult=rule&smartresult=ugc&sessionFrom=dict2.index'

    # Spoof a browser User-Agent so the request is not rejected as a bot
    head = {}
    head['User-Agent'] = 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36'

    # Form fields expected by the Youdao web endpoint
    data = {}
    data['type'] = 'AUTO'
    data['i'] = content
    data['doctype'] = 'json'
    data['xmlVersion'] = '1.8'
    data['keyfrom'] = 'fanyi.web'
    data['ue'] = 'UTF-8'
    data['action'] = 'FY_BY_CLICKBUTTON'
    data['typoResult'] = 'true'
    data = urllib.parse.urlencode(data).encode('utf-8')

    req = urllib.request.Request(url, data, head)
    response = urllib.request.urlopen(req)
    html = response.read().decode('utf-8')

    # The endpoint returns JSON; pull the translated text out of it
    target = json.loads(html)
    print("Translation: %s" % target['translateResult'][0][0]['tgt'])
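
    This web endpoint is unofficial and its required form fields have changed over time, so a small wrapper with error handling fails more gracefully. A minimal sketch, assuming the same url, head and form fields as above (translate_youdao is a hypothetical helper, not part of the original script):

    import json
    import urllib.error
    import urllib.parse
    import urllib.request

    def translate_youdao(text, url, headers, fields):
        """Hypothetical helper: POST the form and return the translation, or None."""
        body = urllib.parse.urlencode(dict(fields, i=text)).encode('utf-8')
        try:
            req = urllib.request.Request(url, body, headers)
            with urllib.request.urlopen(req, timeout=10) as resp:
                result = json.loads(resp.read().decode('utf-8'))
            return result['translateResult'][0][0]['tgt']
        except (urllib.error.URLError, KeyError, json.JSONDecodeError):
            return None  # network failure or an unexpected response shape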
    

    A Python crawler that downloads every image from jandan.net to the local disk

    # -*- coding: utf-8 -*-
    import urllib.request
    import os

    def url_open(url):
        # Set a browser User-Agent, then open the request object
        # (opening the bare URL would silently drop the header)
        req = urllib.request.Request(url)
        req.add_header('User-Agent', 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36')
        response = urllib.request.urlopen(req)
        html = response.read()
        return html
        
    def get_page(url):
        html = url_open(url).decode('utf-8')
        a = html.find('current-comment-page') + 23  # skip past the marker text
        b = html.find(']', a)  # search for ']' starting at position a
        return html[a:b]  # the current page number
        
    def find_imgs(url):
        html = url_open(url).decode('utf-8')
        img_address = []
        a = html.find('img src=')
        while a != -1:
            b = html.find('.jpg', a, a+255)  # look for '.jpg' between a and a+255
            if b != -1:
                # the src is protocol-relative ('//...'), so prepend the scheme
                img_address.append('http:' + html[a+9:b+4])
            else:
                b = a + 9
            a = html.find('img src=', b)
        return img_address
    
    def save_imgs(folder, img_address):
        for each in img_address:
            filename = each.split('/')[-1]  # the last path segment is the file name
            with open(filename, 'wb') as f:
                img = url_open(each)
                f.write(img)
                
    def download_mm(folder='ooxx', pages=10):
        os.mkdir(folder)
        os.chdir(folder)

        url = "http://jandan.net/ooxx"
        page_num = int(get_page(url))

        # Walk backwards from the current page, one page per iteration
        for i in range(pages):
            page_url = url + '/page-' + str(page_num - i) + '#comments'
            img_address = find_imgs(page_url)
            save_imgs(folder, img_address)
            
    if __name__ =='__main__':
        download_mm()
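
    The find/slice loop in find_imgs can also be written more compactly with a regular expression. A sketch under the same assumption the loop makes, namely that the page embeds images as img src="//...jpg" (find_imgs_re is a hypothetical alternative, not part of the original script):

    import re

    def find_imgs_re(html):
        # Capture every protocol-relative .jpg src and prepend the scheme
        pattern = r'img src="(//[^"]+?\.jpg)"'
        return ['http:' + src for src in re.findall(pattern, html)]

    # usage: find_imgs_re(url_open(page_url).decode('utf-8'))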
    

    A Python crawler that downloads every image in a Baidu Tieba thread

    # -*- coding: utf-8 -*-
    import urllib.request
    import re

    def url_open(url):
        # Same helper as above: set a browser User-Agent and open the request object
        req = urllib.request.Request(url)
        req.add_header('User-Agent', 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36')
        response = urllib.request.urlopen(req)
        html = response.read()
        return html
    
    def get_img(html):
        # Match every in-post image (class BDE_Image) that ends in .jpg
        p = r'<img class="BDE_Image" src="([^"]+\.jpg)"'
        imglist = re.findall(p, html.decode('utf-8'))
        for each in imglist:
            filename = each.split("/")[-1]
            urllib.request.urlretrieve(each, filename)

    if __name__ == '__main__':
        url = 'http://tieba.baidu.com/p/3563409202'
        get_img(url_open(url))
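
    Note that urllib.request.urlretrieve sends urllib's default User-Agent rather than the browser one url_open uses; if the site rejects those requests, a global opener can be installed so urlretrieve sends the same header. A minimal sketch, assuming the same UA string as above:

    import urllib.request

    # Install a global opener so urlretrieve also sends the browser User-Agent
    opener = urllib.request.build_opener()
    opener.addheaders = [('User-Agent', 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36')]
    urllib.request.install_opener(opener)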
    

    Crawling the Douban movie Top 250 (for reference)

    import pymysql
    import requests
    from bs4 import BeautifulSoup


    # %d is a placeholder for the page offset
    baseUrl = "https://movie.douban.com/top250?start=%d&filter="

    def get_movies(start):
        url = baseUrl % start
        lists = []
        html = requests.get(url)
        soup = BeautifulSoup(html.content, "html.parser")  # parse the page with BeautifulSoup
        items = soup.find("ol", "grid_view").find_all("li")  # every movie on the page
        for i in items:
            movie = {}  # temporary holder for one movie's fields
            movie["rank"] = i.find("em").text  # ranking position
            movie["link"] = i.find("div", "pic").find("a").get("href")  # detail-page link
            movie["poster"] = i.find("div", "pic").find("a").find('img').get("src")  # poster URL
            movie["name"] = i.find("span", "title").text  # title
            movie["score"] = i.find("span", "rating_num").text  # rating
            movie["other"] = i.find("span", "other").text.replace('/', '').replace('    ', '/')  # alternative titles
            movie["quote"] = i.find("span", "inq").text if i.find("span", "inq") else ""  # some movies have no tagline; default to empty
            movie["comment_num"] = i.find("div", "star").find_all('span')[3].text  # number of ratings
            movie["detail"] = i.find("div", "bd").find("p", "").text  # details (director, cast, year, ...)
            lists.append(movie)  # collect into the return list
        return lists
    
    if __name__ == "__main__":
        # Connect to the database; specify charset, or inserts may fail
        db = pymysql.connect(host="localhost", user="root", password="root", db="new_schema", charset="utf8mb4")
        cursor = db.cursor()
        cursor.execute("DROP TABLE IF EXISTS movies")  # drop the table if it already exists
        # SQL that creates the table (`rank` is backquoted because it is a
        # reserved word in newer MySQL versions)
        createTab = """CREATE TABLE movies(
            id INT NOT NULL AUTO_INCREMENT PRIMARY KEY,
            name VARCHAR(20) NOT NULL,
            `rank` VARCHAR(4) NOT NULL,
            link VARCHAR(50) NOT NULL,
            poster VARCHAR(100) NOT NULL,
            score VARCHAR(4) NOT NULL,
            other VARCHAR(100) NOT NULL,
            quote VARCHAR(50),
            detail VARCHAR(300) NOT NULL,
            comment_num VARCHAR(100) NOT NULL
        )"""
        cursor.execute(createTab)
        for start in range(0, 250, 25):
            lists = get_movies(start)  # scrape one page of results
            for i in lists:
                # Parameterized INSERT; %s placeholders are filled in by pymysql
                sql = "INSERT INTO `movies`(`name`,`rank`,`link`,`poster`,`score`,`other`,`quote`,`detail`,`comment_num`) VALUES(%s,%s,%s,%s,%s,%s,%s,%s,%s)"
                try:
                    cursor.execute(sql, (i["name"], i["rank"], i["link"], i["poster"], i["score"], i["other"], i["quote"], i["detail"], i["comment_num"]))
                    db.commit()
                    print(i["name"] + " inserted")
                except Exception:
                    db.rollback()
        db.close()
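
    Committing row by row is simple but chatty; pymysql's executemany can insert a whole page of results in one call and commit once. A sketch of what would replace the inner loop above, reusing the same INSERT statement:

    # Hedged sketch: batch-insert one page of movies, then commit once
    rows = [(i["name"], i["rank"], i["link"], i["poster"], i["score"],
             i["other"], i["quote"], i["detail"], i["comment_num"]) for i in lists]
    try:
        cursor.executemany(sql, rows)
        db.commit()
    except Exception:
        db.rollback()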
    

    Splitting the scraped Douban movie details into year, country/region, genre, etc. and writing them to MySQL

    import pymysql
    import requests
    from bs4 import BeautifulSoup
    import re

    # %d is a placeholder for the page offset
    baseUrl = "https://movie.douban.com/top250?start=%d&filter="

    def get_movies(start):
        url = baseUrl % start
        lists = []
        html = requests.get(url)
        soup = BeautifulSoup(html.content, "html.parser")  # parse the page with BeautifulSoup
        items = soup.find("ol", "grid_view").find_all("li")  # every movie on the page
        for i in items:
            movie = {}  # temporary holder for one movie's fields
            movie["rank"] = i.find("em").text  # ranking position
            movie["link"] = i.find("div", "pic").find("a").get("href")  # detail-page link
            movie["poster"] = i.find("div", "pic").find("a").find('img').get("src")  # poster URL
            movie["name"] = i.find("span", "title").text  # title
            movie["score"] = i.find("span", "rating_num").text  # rating
            movie["other"] = i.find("span", "other").text.replace('/', '').replace('    ', '/')  # alternative titles
            movie["quote"] = i.find("span", "inq").text if i.find("span", "inq") else ""  # some movies have no tagline; default to empty
            movie["comment_num"] = i.find("div", "star").find_all('span')[3].text  # number of ratings
            movie["detail"] = i.find("div", "bd").find("p", "").text  # details (director, cast, year, ...)
            lists.append(movie)  # collect into the return list
        return lists
    
    
    if __name__ == "__main__":
        # Connect to the database; specify charset, or inserts may fail
        db = pymysql.connect(host="localhost", user="root", password="root", db="new_schema", charset="utf8mb4")
        cursor = db.cursor()
        cursor.execute("DROP TABLE IF EXISTS movies")  # drop the table if it already exists
        # SQL that creates the table (`rank` is backquoted because it is a
        # reserved word in newer MySQL versions)
        createTab = """CREATE TABLE movies(
            id INT NOT NULL AUTO_INCREMENT PRIMARY KEY,
            name VARCHAR(20) NOT NULL,
            `rank` VARCHAR(4) NOT NULL,
            link VARCHAR(50) NOT NULL,
            poster VARCHAR(100) NOT NULL,
            score VARCHAR(4) NOT NULL,
            other VARCHAR(100) NOT NULL,
            quote VARCHAR(50),
            detail VARCHAR(300) NOT NULL,
            time VARCHAR(300) NOT NULL,
            country VARCHAR(300) NOT NULL,
            type VARCHAR(300) NOT NULL,
            director_artist VARCHAR(300) NOT NULL,
            comment_num VARCHAR(100) NOT NULL
        )"""
        cursor.execute(createTab)
        for start in range(0, 250, 25):
            lists = get_movies(start)  # scrape one page of results
            for i in lists:
                data = []  # reset per movie so data[0] is this movie's director
                action = i["detail"]
                # Strip runs of indentation spaces, newlines, stray </br> tags and dots
                remove = re.compile(r'                            |\n|</br>|\.*')
                bd = re.sub(remove, "", action)
                bd = re.sub('<br>', "   ", bd)  # replace <br> with a separator
                bd = re.sub('/', "   ", bd)     # replace '/' with the same separator
                words = bd.split("   ")
                for s in words:
                    if len(s) != 0 and s != ' ':  # skip blank fragments
                        data.append(s)
                i["time"] = data[-3][-5:]  # the year sits at the end of this fragment
                i["country"] = data[-2]
                i["type"] = data[-1]
                i["director_artist"] = data[0]
                # Parameterized INSERT; %s placeholders are filled in by pymysql
                sql = "INSERT INTO `movies`(`name`,`rank`,`link`,`poster`,`score`,`other`,`quote`,`detail`,`time`,`country`,`type`,`director_artist`,`comment_num`) VALUES(%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)"
                try:
                    cursor.execute(sql, (i["name"], i["rank"], i["link"], i["poster"], i["score"], i["other"], i["quote"], i["detail"], i["time"], i["country"], i["type"], i["director_artist"], i["comment_num"]))
                    db.commit()
                    print(i["name"] + " inserted")
                except Exception:
                    db.rollback()
        db.close()
    

    The release years of the TOP 250 movies can then be plotted, for example as a distribution by decade.

    [Figure: decade distribution of the Douban Top 250 movies]
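
    A minimal plotting sketch, assuming the movies table was filled by the script above, that the time column ends in a four-digit year, and that matplotlib is installed (bucketing by decade is one choice among many):

    from collections import Counter

    import matplotlib.pyplot as plt
    import pymysql

    # Read the release years back out of the table built above
    db = pymysql.connect(host="localhost", user="root", password="root", db="new_schema", charset="utf8mb4")
    cursor = db.cursor()
    cursor.execute("SELECT time FROM movies")
    years = [int(row[0].strip()[-4:]) for row in cursor.fetchall()]
    db.close()

    # Bucket the years into decades and draw a bar chart
    decades = Counter(year // 10 * 10 for year in years)
    labels = sorted(decades)
    plt.bar(range(len(labels)), [decades[d] for d in labels])
    plt.xticks(range(len(labels)), ["%ds" % d for d in labels])
    plt.xlabel("decade")
    plt.ylabel("number of movies")
    plt.title("Douban Top 250 by decade")
    plt.show()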
