from selenium import webdriver
import re

def geturl(urlname):
    url = "https://search.51job.com/list/010000,000000,0000,00,9,99," + urlname + ",2,1.html?lang=c&stype=&postchannel=0000&workyear=99&cotype=99&degreefrom=99&jobterm=99&companysize=99&providesalary=99&lonlat=0%2C0&radius=-1&ord_field=0&confirmdate=9&fromType=&dibiaoid=0&address=&line=&specialarea=00&from=&welfare="
    browser = webdriver.Firefox()
    browser.get(url)
    pagesource = browser.page_source  # grab the page source
    restr = "共(\\d+)条"  # without parentheses the whole match is returned; with () only the captured data; if nothing matches there may be a leading space
    rex = re.compile(restr, re.IGNORECASE)
    mylist = rex.findall(pagesource)
    browser.close()
    if len(mylist) == 0:
        return "failed"
    else:
        return mylist[0].strip()  # strip leading/trailing whitespace
# print(geturl("python"))
androids = ["android开发", "安卓开发", "Android", "Android实习生", "软件测试"]
for ands in androids:
    print(ands, geturl(ands))
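
A quick aside on the capture-group comment above: re.findall returns only what is inside the parentheses when the pattern has a group, and the whole match when it does not. A minimal sketch using a made-up sample string in place of the real page source:

import re

sample = "共1234条"  # made-up sample text standing in for the 51job page source
print(re.findall("共\\d+条", sample))    # ['共1234条'] -- whole match, no group
print(re.findall("共(\\d+)条", sample))  # ['1234']     -- only the text inside the parentheses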
urllib.request
import urllib.request
import urllib.error

def download(url):
    response = urllib.request.urlopen(url, timeout=5)  # timeout: give up if the request takes too long
    print(type(response))   # <class 'http.client.HTTPResponse'>
    print(response.info())  # detailed headers of the response
    print(response.read())  # read the raw page source; a byte count such as 100 can also be passed in
try:
    download("http://www.google.com")
except urllib.error.URLError as e:  # catch the error object as a variable
    print("network error", e)
Disguising as a browser
import urllib.request

def openUrl(url):
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) '
                             'Chrome/51.0.2704.63 Safari/537.36'}  # pretend to be a browser (some sites block bare requests, so disguise is needed)
    req = urllib.request.Request(url, headers=headers)
    response = urllib.request.urlopen(req)  # send the request
    html = response.read()                  # fetch the body
    html = html.decode("utf-8")             # decode the bytes
    print(html)                             # print the page

def openUrl2(url):
    return urllib.request.urlopen(url).read()  # read the whole page without custom headers

if __name__ == "__main__":
    url = "http://www.bazhuayu.com/download"  # 'http://www.douban.com/'
    print(openUrl2(url))
    openUrl(url)
You can also pretend to be a mobile browser, and so on, as in the sketch below.
If a site blocks us, we can get around it by disguising the request like this, or by driving a real browser as in the Selenium approach above.
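
A minimal sketch of the mobile-browser idea: the only change is the User-Agent string. The UA value and URL below are just examples, not taken from the original post:

import urllib.request

# example Android Chrome User-Agent; any real mobile UA string works the same way
mobile_headers = {'User-Agent': 'Mozilla/5.0 (Linux; Android 10; Pixel 3) AppleWebKit/537.36 '
                                '(KHTML, like Gecko) Chrome/90.0.4430.91 Mobile Safari/537.36'}
req = urllib.request.Request("http://www.example.com", headers=mobile_headers)
print(urllib.request.urlopen(req).read().decode("utf-8")[:200])  # first 200 characters of the page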
Some URLs need Chinese text passed in (a Baidu search, for example), so here we have to encode and decode it.
import urllib.request
import urllib.parse

url = "http://zzk.cnblogs.com/s?w=python" + urllib.parse.quote("爬虫") + "&t=b"
print(urllib.parse.quote("爬虫"))                          # percent-encode to the standard form
print(urllib.parse.unquote(urllib.parse.quote("爬虫")))    # decode it back
print(urllib.request.urlopen(url).read().decode("UTF-8"))  # fetch and decode the page
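
When several parameters need encoding at once, urllib.parse.urlencode builds the whole query string in one call; a small sketch against the same cnblogs search endpoint (the parameter names are simply taken from the URL above):

import urllib.parse

params = {"w": "python 爬虫", "t": "b"}
query = urllib.parse.urlencode(params)  # percent-encodes every key and value (spaces become '+')
print("http://zzk.cnblogs.com/s?" + query)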