
Getting Started with Python Web Scraping

Author: Yuu_CX | Published 2017-02-28 10:46

    Fetching an image and saving it to a folder

    import urllib.request

    # Download the image bytes and write them to a local file
    response = urllib.request.urlopen('http://placekitten.com/1920/1280')
    cat_img = response.read()
    with open('cat_1920_1280.jpg', 'wb') as f:
        f.write(cat_img)
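
    A slightly more defensive variant, sketched below, adds a timeout and only writes the file when the server answers 200; the URL and filename are the ones from the example above.

    import urllib.request

    # Hedged sketch: the same download, with a timeout and a status check
    response = urllib.request.urlopen('http://placekitten.com/1920/1280', timeout=10)
    if response.status == 200:  # only save the body on success
        with open('cat_1920_1280.jpg', 'wb') as f:
            f.write(response.read())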
    

    Using Youdao Translate

    # -*- coding: utf-8 -*-
    import urllib.request
    import urllib.parse
    import json

    content = input("Enter the text to translate: ")
    url = 'http://fanyi.youdao.com/translate?smartresult=dict&smartresult=rule&smartresult=ugc&sessionFrom=dict2.index'

    # Spoof a browser User-Agent so the request is not rejected as a bot
    head = {}
    head['User-Agent'] = 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36'

    # Form fields expected by the Youdao web endpoint
    data = {}
    data['type'] = 'AUTO'
    data['i'] = content
    data['doctype'] = 'json'
    data['xmlVersion'] = '1.8'
    data['keyfrom'] = 'fanyi.web'
    data['ue'] = 'UTF-8'
    data['action'] = 'FY_BY_CLICKBUTTON'
    data['typoResult'] = 'true'
    data = urllib.parse.urlencode(data).encode('utf-8')

    req = urllib.request.Request(url, data, head)
    response = urllib.request.urlopen(req)
    html = response.read().decode('utf-8')

    # The endpoint returns JSON; pull the translated text out of it
    target = json.loads(html)
    print("Translation: %s" % target['translateResult'][0][0]['tgt'])
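
    This web endpoint is unofficial and its required form fields have changed over time, so a small wrapper with error handling fails more gracefully. A minimal sketch, assuming the same url, head and form fields as above (translate_youdao is a hypothetical helper, not part of the original script):

    import json
    import urllib.error
    import urllib.parse
    import urllib.request

    def translate_youdao(text, url, headers, fields):
        """Hypothetical helper: POST the form and return the translation, or None."""
        body = urllib.parse.urlencode(dict(fields, i=text)).encode('utf-8')
        try:
            req = urllib.request.Request(url, body, headers)
            with urllib.request.urlopen(req, timeout=10) as resp:
                result = json.loads(resp.read().decode('utf-8'))
            return result['translateResult'][0][0]['tgt']
        except (urllib.error.URLError, KeyError, json.JSONDecodeError):
            return None  # network failure or an unexpected response shape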
    

    A Python crawler that downloads every image from jandan.net to the local disk

    # -*- coding: utf-8 -*-
    import urllib.request
    import os

    def url_open(url):
        # Set a browser User-Agent, then open the request object
        # (opening the bare URL would silently drop the header)
        req = urllib.request.Request(url)
        req.add_header('User-Agent', 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36')
        response = urllib.request.urlopen(req)
        html = response.read()
        return html
        
    def get_page(url):
        html = url_open(url).decode('utf-8')
        a = html.find('current-comment-page') + 23  # skip past the marker text
        b = html.find(']', a)  # search for ']' starting at position a
        return html[a:b]  # the current page number
        
    def find_imgs(url):
        html = url_open(url).decode('utf-8')
        img_address = []
        a = html.find('img src=')
        while a != -1:
            b = html.find('.jpg', a, a+255)  # look for '.jpg' between a and a+255
            if b != -1:
                # the src is protocol-relative ('//...'), so prepend the scheme
                img_address.append('http:' + html[a+9:b+4])
            else:
                b = a + 9
            a = html.find('img src=', b)
        return img_address
    
    def save_imgs(folder, img_address):
        for each in img_address:
            filename = each.split('/')[-1]  # the last path segment is the file name
            with open(filename, 'wb') as f:
                img = url_open(each)
                f.write(img)
                
    def download_mm(folder='ooxx', pages=10):
        os.mkdir(folder)
        os.chdir(folder)

        url = "http://jandan.net/ooxx"
        page_num = int(get_page(url))

        # Walk backwards from the current page, one page per iteration
        for i in range(pages):
            page_url = url + '/page-' + str(page_num - i) + '#comments'
            img_address = find_imgs(page_url)
            save_imgs(folder, img_address)
            
    if __name__ =='__main__':
        download_mm()
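
    The find/slice loop in find_imgs can also be written more compactly with a regular expression. A sketch under the same assumption the loop makes, namely that the page embeds images as img src="//...jpg" (find_imgs_re is a hypothetical alternative, not part of the original script):

    import re

    def find_imgs_re(html):
        # Capture every protocol-relative .jpg src and prepend the scheme
        pattern = r'img src="(//[^"]+?\.jpg)"'
        return ['http:' + src for src in re.findall(pattern, html)]

    # usage: find_imgs_re(url_open(page_url).decode('utf-8'))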
    

    A Python crawler that downloads every image in a Baidu Tieba thread

    # -*- coding: utf-8 -*-
    import urllib.request
    import re

    def url_open(url):
        # Same helper as above: set a browser User-Agent and open the request object
        req = urllib.request.Request(url)
        req.add_header('User-Agent', 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36')
        response = urllib.request.urlopen(req)
        html = response.read()
        return html
    
    def get_img(html):
        # Match every in-post image (class BDE_Image) that ends in .jpg
        p = r'<img class="BDE_Image" src="([^"]+\.jpg)"'
        imglist = re.findall(p, html.decode('utf-8'))
        for each in imglist:
            filename = each.split("/")[-1]
            urllib.request.urlretrieve(each, filename)

    if __name__ == '__main__':
        url = 'http://tieba.baidu.com/p/3563409202'
        get_img(url_open(url))
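
    Note that urllib.request.urlretrieve sends urllib's default User-Agent rather than the browser one url_open uses; if the site rejects those requests, a global opener can be installed so urlretrieve sends the same header. A minimal sketch, assuming the same UA string as above:

    import urllib.request

    # Install a global opener so urlretrieve also sends the browser User-Agent
    opener = urllib.request.build_opener()
    opener.addheaders = [('User-Agent', 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36')]
    urllib.request.install_opener(opener)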
    

    Crawling the Douban movie Top 250 (for reference)

    import pymysql
    import requests
    from bs4 import BeautifulSoup


    # %d is a placeholder for the page offset
    baseUrl = "https://movie.douban.com/top250?start=%d&filter="

    def get_movies(start):
        url = baseUrl % start
        lists = []
        html = requests.get(url)
        soup = BeautifulSoup(html.content, "html.parser")  # parse the page with BeautifulSoup
        items = soup.find("ol", "grid_view").find_all("li")  # every movie on the page
        for i in items:
            movie = {}  # temporary holder for one movie's fields
            movie["rank"] = i.find("em").text  # ranking position
            movie["link"] = i.find("div", "pic").find("a").get("href")  # detail-page link
            movie["poster"] = i.find("div", "pic").find("a").find('img').get("src")  # poster URL
            movie["name"] = i.find("span", "title").text  # title
            movie["score"] = i.find("span", "rating_num").text  # rating
            movie["other"] = i.find("span", "other").text.replace('/', '').replace('    ', '/')  # alternative titles
            movie["quote"] = i.find("span", "inq").text if i.find("span", "inq") else ""  # some movies have no tagline; default to empty
            movie["comment_num"] = i.find("div", "star").find_all('span')[3].text  # number of ratings
            movie["detail"] = i.find("div", "bd").find("p", "").text  # details (director, cast, year, ...)
            lists.append(movie)  # collect into the return list
        return lists
    
    if __name__ == "__main__":
        # Connect to the database; specify charset, or inserts may fail
        db = pymysql.connect(host="localhost", user="root", password="root", db="new_schema", charset="utf8mb4")
        cursor = db.cursor()
        cursor.execute("DROP TABLE IF EXISTS movies")  # drop the table if it already exists
        # SQL that creates the table (`rank` is backquoted because it is a
        # reserved word in newer MySQL versions)
        createTab = """CREATE TABLE movies(
            id INT NOT NULL AUTO_INCREMENT PRIMARY KEY,
            name VARCHAR(20) NOT NULL,
            `rank` VARCHAR(4) NOT NULL,
            link VARCHAR(50) NOT NULL,
            poster VARCHAR(100) NOT NULL,
            score VARCHAR(4) NOT NULL,
            other VARCHAR(100) NOT NULL,
            quote VARCHAR(50),
            detail VARCHAR(300) NOT NULL,
            comment_num VARCHAR(100) NOT NULL
        )"""
        cursor.execute(createTab)
        for start in range(0, 250, 25):
            lists = get_movies(start)  # scrape one page of results
            for i in lists:
                # Parameterized INSERT; %s placeholders are filled in by pymysql
                sql = "INSERT INTO `movies`(`name`,`rank`,`link`,`poster`,`score`,`other`,`quote`,`detail`,`comment_num`) VALUES(%s,%s,%s,%s,%s,%s,%s,%s,%s)"
                try:
                    cursor.execute(sql, (i["name"], i["rank"], i["link"], i["poster"], i["score"], i["other"], i["quote"], i["detail"], i["comment_num"]))
                    db.commit()
                    print(i["name"] + " inserted")
                except Exception:
                    db.rollback()
        db.close()
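
    Committing row by row is simple but chatty; pymysql's executemany can insert a whole page of results in one call and commit once. A sketch of what would replace the inner loop above, reusing the same INSERT statement:

    # Hedged sketch: batch-insert one page of movies, then commit once
    rows = [(i["name"], i["rank"], i["link"], i["poster"], i["score"],
             i["other"], i["quote"], i["detail"], i["comment_num"]) for i in lists]
    try:
        cursor.executemany(sql, rows)
        db.commit()
    except Exception:
        db.rollback()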
    

    Splitting the scraped Douban movie details into year, country/region, genre, etc. and writing them to MySQL

    import pymysql
    import requests
    from bs4 import BeautifulSoup
    import re

    # %d is a placeholder for the page offset
    baseUrl = "https://movie.douban.com/top250?start=%d&filter="

    def get_movies(start):
        url = baseUrl % start
        lists = []
        html = requests.get(url)
        soup = BeautifulSoup(html.content, "html.parser")  # parse the page with BeautifulSoup
        items = soup.find("ol", "grid_view").find_all("li")  # every movie on the page
        for i in items:
            movie = {}  # temporary holder for one movie's fields
            movie["rank"] = i.find("em").text  # ranking position
            movie["link"] = i.find("div", "pic").find("a").get("href")  # detail-page link
            movie["poster"] = i.find("div", "pic").find("a").find('img').get("src")  # poster URL
            movie["name"] = i.find("span", "title").text  # title
            movie["score"] = i.find("span", "rating_num").text  # rating
            movie["other"] = i.find("span", "other").text.replace('/', '').replace('    ', '/')  # alternative titles
            movie["quote"] = i.find("span", "inq").text if i.find("span", "inq") else ""  # some movies have no tagline; default to empty
            movie["comment_num"] = i.find("div", "star").find_all('span')[3].text  # number of ratings
            movie["detail"] = i.find("div", "bd").find("p", "").text  # details (director, cast, year, ...)
            lists.append(movie)  # collect into the return list
        return lists
    
    
    if __name__ == "__main__":
        # Connect to the database; specify charset, or inserts may fail
        db = pymysql.connect(host="localhost", user="root", password="root", db="new_schema", charset="utf8mb4")
        cursor = db.cursor()
        cursor.execute("DROP TABLE IF EXISTS movies")  # drop the table if it already exists
        # SQL that creates the table (`rank` is backquoted because it is a
        # reserved word in newer MySQL versions)
        createTab = """CREATE TABLE movies(
            id INT NOT NULL AUTO_INCREMENT PRIMARY KEY,
            name VARCHAR(20) NOT NULL,
            `rank` VARCHAR(4) NOT NULL,
            link VARCHAR(50) NOT NULL,
            poster VARCHAR(100) NOT NULL,
            score VARCHAR(4) NOT NULL,
            other VARCHAR(100) NOT NULL,
            quote VARCHAR(50),
            detail VARCHAR(300) NOT NULL,
            time VARCHAR(300) NOT NULL,
            country VARCHAR(300) NOT NULL,
            type VARCHAR(300) NOT NULL,
            director_artist VARCHAR(300) NOT NULL,
            comment_num VARCHAR(100) NOT NULL
        )"""
        cursor.execute(createTab)
        for start in range(0, 250, 25):
            lists = get_movies(start)  # scrape one page of results
            for i in lists:
                data = []  # reset per movie so data[0] is this movie's director
                action = i["detail"]
                # Strip runs of indentation spaces, newlines, stray </br> tags and dots
                remove = re.compile(r'                            |\n|</br>|\.*')
                bd = re.sub(remove, "", action)
                bd = re.sub('<br>', "   ", bd)  # replace <br> with a separator
                bd = re.sub('/', "   ", bd)     # replace '/' with the same separator
                words = bd.split("   ")
                for s in words:
                    if len(s) != 0 and s != ' ':  # skip blank fragments
                        data.append(s)
                i["time"] = data[-3][-5:]  # the year sits at the end of this fragment
                i["country"] = data[-2]
                i["type"] = data[-1]
                i["director_artist"] = data[0]
                # Parameterized INSERT; %s placeholders are filled in by pymysql
                sql = "INSERT INTO `movies`(`name`,`rank`,`link`,`poster`,`score`,`other`,`quote`,`detail`,`time`,`country`,`type`,`director_artist`,`comment_num`) VALUES(%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)"
                try:
                    cursor.execute(sql, (i["name"], i["rank"], i["link"], i["poster"], i["score"], i["other"], i["quote"], i["detail"], i["time"], i["country"], i["type"], i["director_artist"], i["comment_num"]))
                    db.commit()
                    print(i["name"] + " inserted")
                except Exception:
                    db.rollback()
        db.close()
    

    The release years of the TOP 250 movies can then be plotted, for example as a distribution by decade.

    [Figure: decade distribution of the Douban Top 250 movies]
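
    A minimal plotting sketch, assuming the movies table was filled by the script above, that the time column ends in a four-digit year, and that matplotlib is installed (bucketing by decade is one choice among many):

    from collections import Counter

    import matplotlib.pyplot as plt
    import pymysql

    # Read the release years back out of the table built above
    db = pymysql.connect(host="localhost", user="root", password="root", db="new_schema", charset="utf8mb4")
    cursor = db.cursor()
    cursor.execute("SELECT time FROM movies")
    years = [int(row[0].strip()[-4:]) for row in cursor.fetchall()]
    db.close()

    # Bucket the years into decades and draw a bar chart
    decades = Counter(year // 10 * 10 for year in years)
    labels = sorted(decades)
    plt.bar(range(len(labels)), [decades[d] for d in labels])
    plt.xticks(range(len(labels)), ["%ds" % d for d in labels])
    plt.xlabel("decade")
    plt.ylabel("number of movies")
    plt.title("Douban Top 250 by decade")
    plt.show()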
