美文网首页
爬虫:爬取扇贝小组发帖

爬虫:爬取扇贝小组发帖

作者: 洋阳酱 | 来源:发表于2019-12-24 13:42 被阅读0次

获取扇贝小组发帖

  • 默认获取前5页数据,可以通过修改循环 for page in range(起始页, 结束页 + 1): 的范围调整获取的页数(注意:range 不包含终止值)。
小组帖子详情
  • 默认进入“再次出发的30岁”小组,获取小组帖子。可通过调整网址ID,更换小组。
小组网址
# -*- coding: utf-8 -*-
"""
Created on Sat Jun  8 20:00:41 2019

@author: YangYang
"""

import requests
import json
import re
import xlwt
  
def Login(account, password, timeout=10):
    """Post the credentials to the Shanbay login endpoint.

    The request goes through a newly created ``requests.Session`` which is
    returned to the caller, so that any login state the server attaches to
    the session (e.g. cookies) is reused by later requests made with it.

    This function does NOT check whether the login succeeded; callers must
    inspect the returned response themselves.

    Args:
        account: Value sent as the ``account`` field of the JSON payload.
        password: Value sent as the ``password`` field of the JSON payload.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` waits indefinitely on a stalled connection.

    Returns:
        A ``(session, response)`` tuple: the session used for the POST and
        the response to the login request.

    Raises:
        requests.exceptions.Timeout: If the server does not answer within
            ``timeout`` seconds. Other ``requests`` exceptions propagate
            unchanged.
    """
    payloadHeader = {'user-agent':'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.100 Safari/537.36',
                     'referer': "https://web.shanbay.com/web/account/login/",
                     'content-type': 'application/json'
                     }

    postUrl = 'https://apiv3.shanbay.com/bayuser/login'
    s = requests.Session()  # one session for login and all later requests

    PayloadData = {
        'account': account,
        'code_2fa': "",  # always sent empty; presumably a 2FA code - confirm
        'password': password,
    }
    # The body is serialized by hand because the content-type header above
    # already declares JSON.
    r = s.post(
        postUrl,
        data=json.dumps(PayloadData),
        headers=payloadHeader,
        timeout=timeout,
    )
    return s, r

def GetTextWeb(url, s):
    """Fetch ``url`` through session ``s`` and return the response body text.

    Args:
        url: Address to request with a GET.
        s: Session-like object exposing ``get(url)``; the object it returns
            must have a ``text`` attribute.

    Returns:
        The ``text`` attribute of the response.
    """
    return s.get(url).text

# Ask for the credentials on the console and log in.
account =input("请输入你的账号:")
password = input("请输入你的密码:")

s, r = Login(account, password)
print('\n')
# Login() returns the raw response without validating it, so check it here
# instead of reporting success unconditionally.
# NOTE(review): this assumes the API answers a failed login with a non-2xx
# status code - confirm against the real endpoint.
if not r.ok:
    print("账号登入失败(HTTP状态码:{})".format(r.status_code))
    raise SystemExit(1)
print("账号登入成功!")

# Create the workbook with a single sheet and fill in the header row.
workbook = xlwt.Workbook()
sheet = workbook.add_sheet('社区')
head = [ '标题','链接','作者']
for column, caption in enumerate(head):
    # Row 0 holds the column captions; data rows start at row 1.
    sheet.write(0, column, caption)

print('\n')
print("Excel汇总表已创建")
print('\n')

# Next free worksheet row for each of the three columns (row 0 is the header).
# NOTE(review): the columns are filled independently of each other, so a row
# only describes one thread if every page yields the same number of titles,
# nicknames and share URLs - verify against the real API response.
title_row = 1
author_row = 1
url_row = 1

# Fetch the first 5 pages. range() excludes its stop value, so the stop must
# be 6 to include page 5.
for page in range(1, 6):
    print("正在取读第{}页数据".format(page))
    web = 'https://www.shanbay.com/api/v1/team/44584/thread/?page='+str(page)
    Text_article = GetTextWeb(web,s)

    # Titles: each match is a '"title": "..."' fragment, wrapped in braces so
    # that json.loads decodes the string escapes.
    for fragment in re.findall(r'("title": ".*?"),', Text_article):
        title = json.loads("{" + fragment + "}")['title']
        # Skip entries whose title is the group's own name (presumably the
        # group object embedded in the response - confirm).
        if title != "再次出发的30岁":
            sheet.write(title_row, 0, "[" + title + "]")
            title_row += 1

    # Authors: the fragment runs from '"nickname' up to the next comma.
    for fragment in re.findall(r'("nickname.*?),', Text_article):
        nickname = json.loads("{" + fragment + "}")['nickname']
        sheet.write(author_row, 2, nickname)
        author_row += 1

    # Links: the fragment runs from '"share_url"' up to the next comma.
    for fragment in re.findall(r'("share_url".*?),', Text_article):
        share_url = json.loads("{" + fragment + "}")['share_url']
        sheet.write(url_row, 1, "(" + share_url + ")")
        url_row += 1
        
# Write the workbook to disk. The file name is relative, so it is resolved
# against the current working directory of the process.
output_name = "扇贝小组发帖汇总.xls"
workbook.save(output_name)
print('\n')
print('写入excel成功')
print("文件位置:和代码在同一个文件夹")
print('\n')
# To keep a console window open after the run, add an input() call here.

相关文章

网友评论

      本文标题:爬虫:爬取扇贝小组发帖

      本文链接:https://www.haomeiwen.com/subject/eezxoctx.html