关键词:正则表达式 | 缓存 | 性能优化
在 Python 3 的 `re` 库中,正则表达式的编译函数 `re.compile()` 调用了私有函数 `re._compile()`,但真正的编译计算是由 `sre_compile.compile()` 完成的;`re._compile()` 只是对编译好的表达式做了缓存,并通过 `_MAXCACHE` 将缓存大小硬编码为 512。以下是 `re._compile()` 的源码,摘自:https://github.com/python/cpython/blob/3.5/Lib/re.py (3.6,3.7里也没有变化)
# Upper bound on the number of entries kept in _cache before it is emptied.
_MAXCACHE = 512
def _compile(pattern, flags):
    # internal: compile pattern
    # Fast path: look the pattern up in the module-level cache.  The key is
    # (type of pattern, pattern, flags).
    try:
        p, loc = _cache[type(pattern), pattern, flags]
        # loc is None for entries stored without the LOCALE flag; otherwise
        # the entry is reused only while the current LC_CTYPE setting still
        # equals the one recorded when the entry was stored.
        if loc is None or loc == _locale.setlocale(_locale.LC_CTYPE):
            return p
    except KeyError:
        pass
    # An already-compiled pattern is returned unchanged; flags cannot be
    # combined with it.
    if isinstance(pattern, _pattern_type):
        if flags:
            raise ValueError(
                "cannot process flags argument with a compiled pattern")
        return pattern
    if not sre_compile.isstring(pattern):
        raise TypeError("first argument must be string or compiled pattern")
    # Cache miss (or stale locale entry): do the actual compilation.
    p = sre_compile.compile(pattern, flags)
    # Patterns compiled with the DEBUG flag are never stored in the cache.
    if not (flags & DEBUG):
        # When the cache is full, the whole cache is discarded at once.
        if len(_cache) >= _MAXCACHE:
            _cache.clear()
        if p.flags & LOCALE:
            # Without a usable _locale the pattern is returned uncached.
            if not _locale:
                return p
            # Record the LC_CTYPE setting so the lookup above can detect a
            # later locale change.
            loc = _locale.setlocale(_locale.LC_CTYPE)
        else:
            loc = None
        _cache[type(pattern), pattern, flags] = p, loc
    return p
在某些大规模应用场景下,512 的缓存显然太小了一些。为了摆脱这个瓶颈但不去碰 CPython 的源码,我们可以自己改写 `re._compile()`,从而实现自定义缓存大小(`max_regex_cache`),轻松配个 10000 出来。原函数里很多语句都不知道干嘛用的,但照葫芦画瓢总没错。
#!/usr/bin/env python
# -*- coding: utf-8 -*-
""" RegexCompiler Class
Edited from python 3.5.4 re._compile() for regex caching.
"""
import sre_compile
import _locale
from pathlib import Path
import sys
# Put the directory three levels above this file at the front of sys.path so
# that the import below can be resolved (presumably the project root that
# holds app_config -- verify against the repository layout).
p = Path(__file__).absolute()
sys.path.insert(0, str(p.parent.parent.parent))
from app_config import ALGO_CONFIG # the cache size is configured in ALGO_CONFIG['cache']['regex']
class RegexCompiler(object):
    """Compile regular expressions and cache them with a configurable limit.

    The logic follows ``re._compile()`` from Python 3.5.4, but the cache lives
    on the instance and its maximum size is chosen by the caller instead of
    being hard-coded.

    NOTE(review): ``sre_compile`` is an undocumented CPython module and is
    reported as deprecated since Python 3.11 -- confirm before relying on it
    with newer interpreters.

    Attributes:
        max_regex_cache: Maximum number of entries kept in ``cache``.
        cache: Maps ``(type(pattern), pattern, flags)`` to a
            ``(compiled_pattern, locale_or_None)`` tuple.
        pattern_type: Type of a compiled pattern object.
        DEBUG_FLAG: ``sre_compile.SRE_FLAG_DEBUG``.
        LOCALE_FLAG: ``sre_compile.SRE_FLAG_LOCALE``.
    """

    def __init__(self, max_regex_cache=None):
        """Create an empty cache.

        Args:
            max_regex_cache: Maximum number of cached patterns.  When omitted
                (``None``) the value is read from
                ``ALGO_CONFIG['cache']['regex']``.
        """
        if max_regex_cache is None:
            # Default to the project configuration, as before.
            max_regex_cache = ALGO_CONFIG['cache']['regex']
        self.max_regex_cache = int(max_regex_cache)
        self.cache = {}
        self.pattern_type = type(sre_compile.compile("", 0))
        self.DEBUG_FLAG = sre_compile.SRE_FLAG_DEBUG
        self.LOCALE_FLAG = sre_compile.SRE_FLAG_LOCALE

    def compile(self, pattern, flags=0):
        """Return a compiled pattern, reusing a cached one when possible.

        Args:
            pattern: Pattern source string, or an already compiled pattern.
            flags: Regex flags; must be 0 when ``pattern`` is already compiled.

        Returns:
            The compiled pattern object.

        Raises:
            ValueError: If ``flags`` is given together with a compiled pattern.
            TypeError: If ``pattern`` is neither a string nor a compiled
                pattern.
        """
        key = (type(pattern), pattern, flags)
        try:
            cached, cached_locale = self.cache[key]
        except KeyError:
            pass
        else:
            # Entries stored with a locale are valid only while LC_CTYPE is
            # unchanged; entries stored with None are always valid.
            if (cached_locale is None
                    or cached_locale == _locale.setlocale(_locale.LC_CTYPE)):
                return cached
            # Stale entry: drop it so it does not count towards the limit
            # while it is being replaced below.
            self.cache.pop(key, None)

        if isinstance(pattern, self.pattern_type):
            if flags:
                raise ValueError(
                    "cannot process flags argument with a compiled pattern")
            return pattern
        if not sre_compile.isstring(pattern):
            raise TypeError("first argument must be string or compiled pattern")

        compiled = sre_compile.compile(pattern, flags)

        # Patterns compiled with the DEBUG flag are never cached.
        if flags & self.DEBUG_FLAG:
            return compiled

        # Make room by evicting the oldest entry only, instead of throwing the
        # whole cache away and recompiling every pattern afterwards.
        while self.cache and len(self.cache) >= self.max_regex_cache:
            try:
                del self.cache[next(iter(self.cache))]
            except (StopIteration, RuntimeError, KeyError):
                # The dict changed underneath us; give up on eviction.
                break

        if compiled.flags & self.LOCALE_FLAG:
            # Remember the locale so a later LC_CTYPE change invalidates it.
            current_locale = _locale.setlocale(_locale.LC_CTYPE)
        else:
            current_locale = None
        self.cache[key] = compiled, current_locale
        return compiled
调用方法:
# Create the compiler once and keep reusing it: the cache is stored on the
# instance, so a fresh instance starts with an empty cache.
regex_compiler = RegexCompiler()
# regex_str is the pattern source string, to be supplied by the caller.
compiled = regex_compiler.compile(regex_str)
进一步优化是将这个类变成Singleton(之后我应该会专门写一篇),以及多模块共享。
网友评论