python正则表达式学习

作者: 何者越清 | 来源:发表于2020-09-18 10:57 被阅读0次

python爬虫学习-day7-实战
Python 基础爬虫目录
python爬虫学习-day5-selenium
python爬虫学习-day6-ip池
python爬虫学习-day3-BeautifulSoup
python爬虫学习-day4-使用lxml+xpath提取内容
python爬虫学习-day2正则表达式
python爬虫学习-day1
正则表达式
正则表达式

re.search(r'正则表达式', '待搜索的字符串') 在字符串中搜索第一个匹配项，找到则返回 Match 对象，否则返回 None

. 除了换行符的所有符号

\. 表示.符号

| 或者

^ 匹配字符串的开始位置；多行模式（re.M）下还匹配每个换行符之后的位置

$ 匹配字符串的结束位置；多行模式（re.M）下还匹配每个换行符之前的位置

[...] 字符类：匹配方括号中的任意一个字符，大多数元字符在其中失去特殊作用（\、^、-、] 除外）。例如 [\n] 匹配换行符，[^0-3] 匹配除 0-3 以外的任意字符

{M,N} 要求 M≤N，匹配前面的正则 M 到 N 次

* 匹配前面正则0次或者多次，不能写在{}中

+ 匹配前面正则1次或者多次，不能写在{}中

? 匹配前面正则0次或者1次，不能写在{}中

\序号 反向引用：匹配与对应序号的分组相同的内容，例如 (a)\1 匹配 aa

\A 开始位置

\Z 结束位置

\b 单词边界

\B 非单词边界（与 \b 相反）

\d 数字

\D 非数字

\w 单词字符，ASCII 模式下等价于 [a-zA-Z0-9_]（Python 3 的 str 模式默认还匹配其他 Unicode 字母和数字）

\W \w取反

() 捕获分组；使用 findall 时，各分组匹配的内容会单独列在元组中返回

(?:) 非捕获分组；findall 不会把它匹配的内容单独列出来

[^a] 非a字符

使用

import re  # regular-expression support from the standard library

# Walkthrough of the basic `re` calls.  Only the first two results are
# printed; the remaining calls are evaluated and their results discarded.

ss = 't love fishc'

# Literal search: prints the Match object for the first occurrence of 'love'.
print(re.search(r'love', ss))

# \d matches a single digit.
print(re.search(r'\d', 'i lova 233'))

# A character class matches any one of the listed characters.
re.search(r'[aeiou]', 'I love 22')

# Dotted-quad IPv4 address: three "octet + dot" groups followed by one octet,
# where an octet is 200-249, 250-255, 000-199 or a one/two digit number.
re.search(r'(([2][0-4]\d|25[0-5]|[01]\d\d|\d{0,1}\d)\.){3}([2][0-4]\d|25[0-5]|[01]\d\d|\d{0,1}\d)', '192.168.2.22')

# Raw string: without the r prefix "\t" is turned into a TAB character and
# "\h" is an invalid escape sequence.
# NOTE(review): this sample looks like HTML that lost characters when the
# article was extracted -- confirm the intended text.
s = r'<\title><\html>'

# Greedy: ".+" grabs as much as possible, so the match spans the whole string.
re.search(r'<.+>', s)

# Non-greedy: ".+?" stops at the first '>', so only the first tag is matched.
re.search(r'<.+?>', s)

# A compiled pattern object exposes the same search/findall operations as
# methods and can be reused.
p = re.compile(r'[a-z]')

p.search('I love python')

p.findall('I love ss')

import os
import ssl
import urllib.request

def new_request(url):
    """Fetch *url* and return the response body decoded as UTF-8 text.

    The request is sent with a desktop Chrome ``User-Agent`` header.

    Args:
        url: Address to fetch.

    Returns:
        The response body as a ``str``.

    Raises:
        Whatever ``urllib.request.urlopen`` raises for a failed request, and
        ``UnicodeDecodeError`` when the body is not valid UTF-8.
    """
    # NOTE(review): this replaces the process-wide default HTTPS context with
    # an unverified one, so certificate checking is turned off for every later
    # urllib HTTPS request in this process, not only this call.  Kept as-is
    # because other code in the file may depend on it -- confirm it is intended.
    ssl._create_default_https_context = ssl._create_unverified_context

    reqt = urllib.request.Request(url)
    reqt.add_header('User-Agent','Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/85.0.4183.83 Safari/537.36')

    # The context manager closes the connection even if read/decode fails.
    with urllib.request.urlopen(reqt) as res:
        html = res.read().decode('utf-8')

    return html

def get_img(htmlstr):
    """Download every ``.jpg`` referenced by a ``src="..."`` attribute.

    Each captured URL is printed and then saved into the ``nvshenba``
    directory (created under the current working directory when missing),
    using the last path segment of the URL as the file name.

    Args:
        htmlstr: HTML text to scan.

    Returns:
        None.
    """
    # NOTE(review): in the text this code was recovered from, the active
    # pattern was cut off after the opening quote (p = r').  The pattern below
    # is the commented-out alternative that survived next to it -- confirm it
    # is the one intended.
    p = r'src="([^"]+\.jpg)"'
    print('p:', p)

    imglist = re.findall(p, htmlstr)

    pname = 'nvshenba'
    # exist_ok avoids the separate exists() check.
    os.makedirs(pname, exist_ok=True)

    for img in imglist:
        print(img)
        filename = img.split('/')[-1]
        # Build the target path explicitly instead of os.chdir(): changing the
        # working directory made a second call create nvshenba/nvshenba.
        urllib.request.urlretrieve(img, os.path.join(pname, filename))

def _join_ip_fragments(fragments):
    """Join the ordered text fragments of one table cell into ``host:port``.

    The final fragment is appended after a ``:``.  Any earlier fragment that
    is identical to the fragment immediately before it is dropped.

    Args:
        fragments: Strings in the order they appeared in the cell.

    Returns:
        The assembled string; ``''`` when *fragments* is empty.
    """
    pieces = []
    previous = ''
    last_index = len(fragments) - 1
    for index, fragment in enumerate(fragments):
        if index >= last_index:
            pieces.append(':' + fragment)
        elif fragment != previous:
            pieces.append(fragment)
        previous = fragment
    return ''.join(pieces)


if __name__ == '__main__':
    # Disabled alternative mode: download the images of a forum thread.
    # url = 'https://tieba.baidu.com/p/5157570498'
    # html = new_request(url)
    # get_img(html)

    # Scrape proxy addresses.
    purl = 'http://www.goubanjia.com'
    pht = new_request(purl)
    print('pth len:', len(pht))

    # Demonstration: find every "ip[:port]" in a sample string.
    # [01] replaces the original [0,1], which also accepted a literal comma.
    pr = r'(?:(?:[01]?\d?\d|2[0-4]\d|25[0-5])\.){3}(?:[01]?\d?\d|2[0-4]\d|25[0-5]):?\d{0,4}'
    ps = re.findall(pr, 'asdasd152.36.6.54:331afafaf12.36.5.5adsasd')
    print('ps', ps)

    # Every <td class="ip"> cell of the fetched page.  '.' does not match a
    # newline here, so a cell is only found when it sits on a single line.
    pr = r'<td class="ip">.*?</td>'
    pslist = re.findall(pr, pht)

    # Digit runs with an optional leading or trailing dot.
    # NOTE(review): presumably the page splits each address across several
    # elements and repeats some of them -- verify against the live markup.
    pr2 = r'\d*\.|\.?\d+'
    core = re.compile(pr2)

    for each1 in pslist:
        print('ip:', _join_ip_fragments(core.findall(each1)))