The code for this chapter is collected at:
GitHub - jiajia154569836/Python: Python practice notes
Notes:
1. Install Python first.
2. Install the required dependencies, e.g. python -m pip install requests (a combined install command for everything this chapter uses follows these notes).
3. Pitfalls I hit (both links are 博客园/cnblogs posts):
Installing Scrapy on Windows 10 with Python 3.5 - python菜鸟 - 博客园
"no module named win32api" error when writing a Scrapy spider - 不活在梦想里 - 博客园
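For convenience, everything this chapter imports can be installed in one go; the package list below is my reading of the imports in the code that follows, not a list from the original repo:
python -m pip install requests beautifulsoup4 lxml pymongo scrapy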
0. Easter egg
Drawing Peppa Pig in Python (using the built-in turtle library) - CSDN博客
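The linked article has the full Peppa Pig; as a taste of the turtle API it relies on, here is a minimal toy sketch of my own (not code from the article) that just draws a head-like circle:
import turtle

t = turtle.Turtle()
t.pensize(4)
t.pencolor('pink')
t.circle(80)       # a circle of radius 80, standing in for the head outline
turtle.done()      # keep the window open until it is closed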
1. Scraping the KuGou TOP500
Design:
1. Fetch the HTML with requests.
2. Parse the HTML with BeautifulSoup.
3. Find the class of the target text and pull it out with CSS selectors.
4. Store the records in MongoDB; the time.sleep call slows the crawl down so inserting never falls behind fetching.
Since I don't have MongoDB installed, the related lines are commented out (records are printed to the console instead).
The code:
import time
import requests
from bs4 import BeautifulSoup
from pymongo import MongoClient

# client = MongoClient()  # MongoDB server
# songs = client.kugou_db.songs  # song collection

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36'
}

def get_info(url):
    '''Fetch the KuGou Music TOP500 entries on one page'''
    wb_data = requests.get(url, headers=headers)
    soup = BeautifulSoup(wb_data.text, 'lxml')
    ranks = soup.select('.pc_temp_num')  # rank list
    titles = soup.select('.pc_temp_songlist > ul > li > a')  # "singer - song" list
    song_times = soup.select('.pc_temp_time')  # duration list
    for rank, title, song_time in zip(ranks, titles, song_times):
        data = {
            'rank': rank.get_text().strip(),
            'singer': title.get_text().split('-')[0].strip(),
            'song': title.get_text().split('-')[1].strip(),
            'time': song_time.get_text().strip()
        }
        print(data)
        # song_id = songs.insert(data)  # insert into db
        # print(song_id)
    print('---------------------------------')

if __name__ == '__main__':
    # build the URLs to crawl (the TOP500 spans 23 pages)
    urls = ['http://www.kugou.com/yy/rank/home/{}-8888.html'.format(str(i)) for i in range(1, 24)]
    for url in urls:
        get_info(url)
        time.sleep(1)  # throttle requests
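If MongoDB is running locally, the commented-out lines above are all that is missing. Here is a minimal sketch of the insert, assuming a default mongod on localhost:27017 and using insert_one (the current pymongo spelling of the older insert):
from pymongo import MongoClient

client = MongoClient()             # defaults to localhost:27017
songs = client.kugou_db.songs      # the collection is created on first insert

# stands in for one `data` dict built inside get_info()
data = {'rank': '1', 'singer': 'demo', 'song': 'demo', 'time': '3:30'}
song_id = songs.insert_one(data).inserted_id
print(song_id)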
2. Scraping Lagou job listings
import json
import math
import time
import pymongo
import requests

# client = pymongo.MongoClient('localhost', 27017)
# mydb = client['mydb']
# lagou = mydb['lagou']

headers = {
    'Accept': 'application/json, text/javascript, */*; q=0.01',
    'Accept-Encoding': 'gzip, deflate, br',
    'Accept-Language': 'zh-CN,zh;q=0.8',
    'Connection': 'keep-alive',
    'Content-Length': '26',
    'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8',
    'Cookie': 'xxxxxxxxxxxxxxxxx',  # fill in your own Lagou cookie
    'Host': 'www.lagou.com',
    'Origin': 'https://www.lagou.com',
    'Referer': 'https://www.lagou.com/jobs/list_python?labelWords=&fromSearch=true&suginput=',
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36',
    'X-Anit-Forge-Code': '0',
    'X-Anit-Forge-Token': 'None',
    'X-Requested-With': 'XMLHttpRequest'
}

def get_page(url, params):
    '''Read the total hit count, then crawl every page of results.'''
    html = requests.post(url, data=params, headers=headers)
    json_data = json.loads(html.text)
    total_count = json_data['content']['positionResult']['totalCount']
    # 15 results per page; Lagou serves at most 30 pages
    page_number = math.ceil(total_count / 15) if math.ceil(total_count / 15) < 30 else 30
    get_info(url, page_number)

def get_info(url, page):
    for pn in range(1, page + 1):
        params = {
            'first': 'false',
            'pn': str(pn),
            'kd': 'Python'
        }
        try:
            html = requests.post(url, data=params, headers=headers)
            json_data = json.loads(html.text)
            results = json_data['content']['positionResult']['result']
            for result in results:
                infos = {
                    'businessZones': result['businessZones'],
                    'city': result['city'],
                    'companyFullName': result['companyFullName'],
                    'companyLabelList': result['companyLabelList'],
                    'companySize': result['companySize'],
                    'district': result['district'],
                    'education': result['education'],
                    'explain': result['explain'],
                    'financeStage': result['financeStage'],
                    'firstType': result['firstType'],
                    'formatCreateTime': result['formatCreateTime'],
                    'gradeDescription': result['gradeDescription'],
                    'imState': result['imState'],
                    'industryField': result['industryField'],
                    'jobNature': result['jobNature'],
                    'positionAdvantage': result['positionAdvantage'],
                    'salary': result['salary'],
                    'secondType': result['secondType'],
                    'workYear': result['workYear']
                }
                print('------------------')
                print(infos)
                # lagou.insert_one(infos)
            time.sleep(2)  # throttle between pages
        except requests.exceptions.ConnectionError:
            pass

if __name__ == "__main__":
    url = 'https://www.lagou.com/jobs/positionAjax.json'
    params = {
        'first': 'true',
        'pn': '1',
        'kd': 'python'
    }
    get_page(url, params)
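MongoDB is commented out here as well. As a lightweight alternative sink, here is a minimal sketch that appends each record to a CSV file with the standard library; the filename, field subset, and save_row helper are my own, not part of the original script:
import csv
import os

FIELDS = ['city', 'companyFullName', 'education', 'salary', 'workYear']  # a subset of the keys in infos

def save_row(infos, path='lagou.csv'):
    '''Append one record, writing the header only when the file is new.'''
    write_header = not os.path.exists(path)
    with open(path, 'a', newline='', encoding='utf-8') as f:
        writer = csv.DictWriter(f, fieldnames=FIELDS, extrasaction='ignore')
        if write_header:
            writer.writeheader()
        writer.writerow(infos)
Calling save_row(infos) where the script currently prints would leave a flat file of results behind after the crawl.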
3. Scraping Taobao product listings
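The actual code is in the git repo. Purely for orientation, here is a minimal sketch of the classic approach of regexing the JSON that Taobao search pages used to embed; the URL, regexes, and header values are my assumptions, and whether Taobao still serves this JSON without a valid logged-in cookie is not guaranteed:
import re
import requests

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36',
    'Cookie': 'xxxxxxxxxxxxxxxxx'  # Taobao search generally requires a logged-in cookie
}

def get_taobao_items(keyword):
    '''Pull (title, price) pairs out of one Taobao search result page.'''
    url = 'https://s.taobao.com/search?q=' + keyword
    html = requests.get(url, headers=headers).text
    titles = re.findall(r'"raw_title":"(.*?)"', html)    # item titles in the embedded JSON
    prices = re.findall(r'"view_price":"(.*?)"', html)   # prices in the embedded JSON
    return list(zip(titles, prices))

if __name__ == '__main__':
    for title, price in get_taobao_items('python'):
        print(price, title)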
4. Crawling Xiaozhu (short-term rentals) with a Scrapy spider
scrapy startproject new
scrapy crawl new
For the full code, see the git project linked at the top of this article.
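For orientation, here is a minimal spider of the kind that would live under new/spiders/ after scrapy startproject new; the class, start URL, and CSS selectors below are illustrative guesses rather than the repo's actual code:
import scrapy

class XiaozhuSpider(scrapy.Spider):
    name = 'new'                            # matches the `scrapy crawl new` command above
    start_urls = ['http://bj.xiaozhu.com/']

    def parse(self, response):
        # the selectors are placeholders; the real ones depend on Xiaozhu's current markup
        for item in response.css('ul.pic_list li'):
            yield {
                'title': item.css('span.result_title::text').get(),
                'price': item.css('span.result_price i::text').get(),
            }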