2019年11月28日
一.基本元字符
image.png image.png image.png
1. \w 匹配任意语言的单词字符、数字、下划线
2. . (点)匹配除换行符以外的任意单个字符
3. ^ 匹配字符串的开头,$ 匹配字符串的结尾
二.具体例子
1.匹配邮箱后缀是 @zhijieketang.com
import re

# Unanchored pattern: finds an address anywhere inside a longer string.
p1 = r'\w+@zhijieketang\.com'
# Anchored pattern: the whole string must consist of exactly one address.
p2 = r'^\w+@zhijieketang\.com$'

text = "Tony's email is tony_guan588@zhijieketang.com."
email = 'tony_guan588@zhijieketang.com'

# Expected results, in order:
#   p1 against the sentence   -> match
#   p2 against the sentence   -> None (extra text surrounds the address)
#   p2 against the bare email -> match
for pattern, subject in ((p1, text), (p2, text), (p2, email)):
    m = re.search(pattern, subject)
    print(m)
2.[]使用
import re

# A character class matches exactly one character out of the listed set,
# so this accepts 'Java' and 'java' but not 'JAVA'.
# (An alternation such as r'Java|java|JAVA' would also accept the all-caps form.)
p = r'[Jj]ava'

sentences = (
    'I like Java and Python.',  # match
    'I like JAVA and Python.',  # no match: 'AVA' is upper-case
    'I like java and Python.',  # match
)
for sentence in sentences:
    m = re.search(p, sentence)
    print(m)
2. ^ 在 [] 内表示取反操作【在 [] 之外,它还表示匹配一行的开头】
2.1 匹配非数字字符:字符串不是纯数字、有其他字符出现时才匹配。r'[^0123456789]' 与 r'[^0-9]' 等价
import re

# A leading ^ inside [] negates the class: match any single character
# that is NOT one of the listed digits.
p = r'[^0123456789]'

for candidate in ('1000', 'Python 3'):
    # '1000' is all digits -> None; 'Python 3' -> matches at 'P'.
    m = re.search(p, candidate)
    print(m)
3.区间概念 []匹配的是一个字符
import re

# Ranges inside [] still describe a single character.
alnum_hit = re.search(r'[A-Za-z0-9]', 'A10.3')
print(alnum_hit)  # matches 'A'

# Two ranges side by side: 0-2 and 5-7, i.e. the characters 0 1 2 5 6 7.
# 'A3489C' contains none of them, so the search yields None.
m = re.search(r'[0-25-7]', 'A3489C')
print(m)
4.预定义字符使用
image.png
import re
# \D is the predefined shorthand for "not a digit"; the hand-written
# equivalent used earlier was r'[^0123456789]'.
p = r'\D'

for candidate in ('1000', 'Python 3'):
    # '1000' -> None; 'Python 3' -> matches at 'P'.
    m = re.search(p, candidate)
    print(m)

# \w also matches non-ASCII word characters for str patterns,
# so the first Chinese character is matched here.
text = '你们好Hello'
m = re.search(r'\w', text)
print(m)
5.量词使用
image.png
5.1 单个字符重复
import re

# `?` -- zero or one occurrence of the preceding token.
m = re.search(r'\d?', '87654321')
print(m)  # matches '8'
m = re.search(r'\d?', 'ABC')
print(m)  # matches the empty string '' (zero digits at position 0)

# `*` -- zero or more occurrences.
m = re.search(r'\d*', '87654321')
print(m)  # matches '87654321'
m = re.search(r'\d*', 'ABC')
print(m)  # matches the empty string ''

# `+` -- one or more occurrences.
m = re.search(r'\d+', '87654321')
print(m)  # matches '87654321'
m = re.search(r'\d+', 'ABC')
print(m)  # None: at least one digit is required

# `{n}` -- exactly n occurrences.
m = re.search(r'\d{8}', '87654321')
print(m)  # matches '87654321' (stray debug literal removed from this print)
m = re.search(r'\d{8}', 'ABC')
print(m)  # None

# `{n,m}` -- between n and m occurrences; greedy, so all eight digits are taken.
m = re.search(r'\d{7,8}', '87654321')
print(m)  # matches '87654321'

# `{n,}` -- at least n occurrences.
m = re.search(r'\d{9,}', '87654321')
print(m)  # None: the subject has only eight digits
量词默认都是贪婪的(尽可能多地匹配);在量词后面加 ? 即变成惰性量词(尽可能少地匹配)
digits = '87654321'

# Greedy {5,8} takes as many digits as allowed -> '87654321'.
greedy_hit = re.search(r'\d{5,8}', digits)
print(greedy_hit)

# Lazy {5,8}? stops at the minimum of five digits -> '87654'.
m = re.search(r'\d{5,8}?', digits)
print(m)
5.2 分组 多个字符重复(分组) 一个小括号就是一组
5.2.1一般用法
import re

# A parenthesised group can be quantified as a unit: '121' twice in a row.
p = r'(121){2}'
m = re.search(p, '121121abcabc')
# Printed in order: the match object, the whole matched text, the content
# of group 1 (group numbering starts at 1), and the tuple of all groups.
print(m, m.group(), m.group(1), m.groups(), sep='\n')

# Each pair of parentheses is one group: area code, then phone number.
p = r'(\d{3,4})-(\d{7,8})'
m = re.search(p, '010-87654321')
# Printed in order: the match object, the whole matched text, all groups.
print(m, m.group(), m.groups(), sep='\n')
image.png
5.2.2 添加组名
import re

# (?P<name>...) attaches a name to a group; it keeps its number as well.
p = r'(?P<area_code>\d{3,4})-(?P<phone_code>\d{7,8})'
m = re.search(p, '010-87654321')

# The match object, the whole matched text, then the tuple of all groups.
print(m, m.group(), m.groups(), sep='\n')

# The same groups fetched first by number, then by name.
for group_key in (1, 2, 'area_code', 'phone_code'):
    print(m.group(group_key))
5.2.3 反向引用分组:在正则表达式内部引用之前的分组
import re

# \1 refers back to whatever group 1 captured, so the closing tag must
# repeat the opening tag's name. Without the backreference (a second
# independent ([\w]+) group) mismatched tags would be accepted too.
p = r'<([\w]+)>.*</\1>'

for markup in ('<a>abc</a>', '<a>abc</b>'):
    # '<a>abc</a>' -> match; '<a>abc</b>' -> None (tag names differ).
    m = re.search(p, markup)
    print(m)
5.2.4非捕获分组,整体内容输出 分组里面用?:开头(?:)
import re

s = 'img1.jpg,img2.jpg,img3.bmp'

# findall() returns the group contents when the pattern has a capturing
# group, and the whole match when it has none.
#   capturing (\.jpg)       -> only the captured '.jpg' pieces
#   non-capturing (?:\.jpg) -> the complete file names
for p in (r'\w+(\.jpg)', r'\w+(?:\.jpg)'):
    mlist = re.findall(p, s)
    print(mlist)
image.png
三.re模块函数使用
1. search() 与 match()
import re

p = r'\w+@zhijieketang\.com'
text = "Tony's email is tony_guan588@zhijieketang.com."
email = 'tony_guan588@zhijieketang.com'

# search() scans the whole string; match() only succeeds at position 0.
#   sentence:   search -> match, match -> None
#   bare email: search -> match, match -> match
for subject in (text, email):
    for finder in (re.search, re.match):
        m = finder(p, subject)
        print(m)

# Accessors on the match object (m is the result of re.match(p, email)).
print('match对象几个方法:')
for detail in (m.group(), m.start(), m.end(), m.span()):
    # tony_guan588@zhijieketang.com, 0, 29, (0, 29)
    print(detail)
2.findall() finditer()
import re

p = r'[Jj]ava'
text = 'I like Java and java.'

# findall() returns every non-overlapping match as a list of strings.
match_list = re.findall(p, text)
print(match_list)  # ['Java', 'java']

# finditer() yields one match object per hit instead of building a list.
match_iter = re.finditer(p, text)
for m in match_iter:
    # The loop body must be indented; without it the snippet does not parse.
    print(m.group())
image.png
3.按照匹配的字符串进行分割split()
import re

p = r'\d+'
text = 'AB12CD34EF'

# maxsplit=0 is the default and means "no limit"; 1 and 2 cap the number
# of splits performed from the left.
#   0 -> ['AB', 'CD', 'EF']
#   1 -> ['AB', 'CD34EF']
#   2 -> ['AB', 'CD', 'EF']
for limit in (0, 1, 2):
    clist = re.split(p, text, maxsplit=limit)
    print(clist)
image.png
4.匹配字符串替换sub()
import re

p = r'\d+'
text = 'AB12CD34EF'

# count=0 is the default and means "replace every occurrence"; 1 and 2
# cap the number of replacements performed from the left.
#   0 -> 'AB CD EF'
#   1 -> 'AB CD34EF'
#   2 -> 'AB CD EF'
for limit in (0, 1, 2):
    replaced_text = re.sub(p, ' ', text, count=limit)
    print(replaced_text)
image.png
四.已编译的正则表达式:与第三部分的区别就是提高效率(先编译,后重复使用),调用时不需要再传入 p 值
image.png
1.例子
import re

# A compiled pattern exposes the same operations as the module-level
# functions, minus the pattern argument.

# search() / match()
p = r'\w+@zhijieketang\.com'
regex = re.compile(p)
text = "Tony's email is tony_guan588@zhijieketang.com."
m = regex.search(text)
print(m)  # match: the address is found inside the sentence
m = regex.match(text)
print(m)  # None: the sentence does not start with the address

# findall() / finditer()
p = r'[Jj]ava'
regex = re.compile(p)
text = 'I like Java and java.'
match_list = regex.findall(text)
print(match_list)  # ['Java', 'java']
match_iter = regex.finditer(text)
for m in match_iter:
    # The loop body must be indented; without it the snippet does not parse.
    print(m.group())

# split() / sub()
p = r'\d+'
regex = re.compile(p)
text = 'AB12CD34EF'
clist = regex.split(text)
print(clist)  # ['AB', 'CD', 'EF']
replaced_text = regex.sub(' ', text)
print(replaced_text)  # 'AB CD EF'
image.png
2. 指定字符匹配模式:re.A 只按 ASCII 匹配,默认是 re.U(按 Unicode 匹配)
import re

text = '你们好Hello'
p = r'\w+'

# re.U: \w covers the Chinese characters, so search and match both succeed.
# re.A: \w is ASCII-only, so search finds 'Hello' further in while match,
#       which must succeed at position 0, returns None.
for flag in (re.U, re.A):
    regex = re.compile(p, flag)
    for lookup in (regex.search, regex.match):
        m = lookup(text)
        print(m)
3.忽略大小写 re.I
import re

p = r'(java).*(python)'
# re.I makes the match case-insensitive, so every spelling below matches.
regex = re.compile(p, re.I)

sentences = (
    'I like Java and Python.',
    'I like JAVA and Python.',
    'I like java and Python.',
)
for sentence in sentences:
    m = regex.search(sentence)
    print(m)
4.点元字符匹配换行符 re.DOTALL
import re

p = r'.+'
subject = 'Hello\nWorld.'

# Without flags (0) the dot stops at the newline, so only 'Hello' matches;
# with re.DOTALL the dot also matches '\n' and the whole string matches.
for flags in (0, re.DOTALL):
    regex = re.compile(p, flags)
    m = regex.search(subject)
    print(m)
image.png
5.多行模式, 对 ^ $ 开始和结束会有影响
import re

p = r'^World'

# By default ^ only matches at the very start of the string.
regex = re.compile(p)
m = regex.search('Hello\nWorld.')
print(m)  # None: 'World' is not at the start of the string

# With re.M, ^ also matches right after each newline.
# (A stray '[' before re.compile made this line a syntax error.)
regex = re.compile(p, re.M)
m = regex.search('Hello\nWorld.')
print(m)  # match: 'World' starts the second line
image.png
6.详细模式,很方便阅读 re.VERBOSE
import re

# In verbose mode, whitespace in the pattern is ignored and '#' starts a
# comment that runs to the end of the line, so the pattern can be laid
# out and annotated piece by piece.
p = """(java) #匹配java字符串
.* #匹配任意字符零或多个
(python) #匹配python字符串
"""
regex = re.compile(p, re.I | re.VERBOSE)

sentences = (
    'I like Java and Python.',
    'I like JAVA and Python.',
    'I like java and Python.',
)
for sentence in sentences:
    # Case-insensitive, so all three sentences match.
    m = regex.search(sentence)
    print(m)
image.png
如果您发现本文对你有所帮助,如果您认为其他人也可能受益,请把它分享出去。
网友评论