# 全角转半角
# Code-point translation table, built once at import time.
# U+3000 (full-width space) maps to the ASCII space; U+FF01..U+FF5E map to
# ASCII 0x21..0x7E by subtracting the fixed offset 0xFEE0 (65248).
_SBC_TO_DBC_TABLE = {0x3000: 0x20}
_SBC_TO_DBC_TABLE.update(
    (code, code - 0xFEE0) for code in range(0xFF01, 0xFF5F)
)


def sbc_to_dbc(ustring):
    """Convert full-width (SBC) characters to their half-width (DBC) forms.

    Args:
        ustring: A string, or an iterable of strings whose converted
            pieces are concatenated in order.

    Returns:
        A single string in which U+3000 is replaced by an ASCII space and
        every character in U+FF01..U+FF5E is replaced by its ASCII
        counterpart. All other characters are returned unchanged.
    """
    if isinstance(ustring, str):
        # Fast path: one C-level pass over the whole string.
        return ustring.translate(_SBC_TO_DBC_TABLE)
    return "".join(piece.translate(_SBC_TO_DBC_TABLE) for piece in ustring)
# 替换规则:实际应用中可以将规则定义在配置文件或数据库表中。type 为 1 表示文本替换,type 为 2 表示正则替换。需要注意的是,替换规则按列表中的顺序依次处理。
# Ordered replacement rules as (src, type, dest) triples expanded into dicts.
# type 1 is a literal text replacement; type 2 treats src as a regular
# expression. Order matters: a longer unit must come before the shorter unit
# it contains (e.g. "毫克" before "克"), and the whitespace-collapsing regex
# runs last so it also cleans up spaces introduced by earlier rules.
replaces = [
    {"src": pattern, "type": kind, "dest": replacement}
    for pattern, kind, replacement in (
        ("毫克", 1, "mg"),
        ("克", 1, "g"),
        ("毫升", 1, "ml"),
        ("升", 1, "l"),
        ("公斤", 1, "kg"),
        ("。", 1, " "),
        (",", 1, " "),
        ("x", 1, "*"),
        ("×", 1, "*"),
        (" +", 2, " "),
    )
]
def text_replace(txt, rules=None):
    """Apply an ordered list of replacement rules to *txt*.

    Args:
        txt: The string to rewrite.
        rules: Optional sequence of rule dicts with the keys ``"src"``,
            ``"type"`` and ``"dest"``. When omitted, the module-level
            ``replaces`` list is used.

    Returns:
        The string after every rule has been applied in order. A rule of
        type 1 replaces each literal occurrence of ``src``; a rule of
        type 2 treats ``src`` as a regular expression. Rules with any
        other type are skipped.
    """
    # Imported locally so the function does not depend on a file-level
    # ``import re`` being present.
    import re

    if rules is None:
        rules = replaces
    for item in rules:
        replace_type = item["type"]
        src = item["src"]
        dest = item["dest"]
        if replace_type == 1:
            txt = txt.replace(src, dest)
        elif replace_type == 2:
            txt = re.sub(src, dest, txt)
    return txt
# 标准化方法:移除两边的空格和换行符,并转换成小写
def normalize(txt):
    """Return *txt* in a normalized form.

    The input is stripped of leading and trailing whitespace, lower-cased,
    converted from full-width to half-width characters via ``sbc_to_dbc``,
    and finally passed through ``text_replace``.

    Args:
        txt: The string to normalize.

    Returns:
        The normalized string.
    """
    # NOTE(review): stripping happens before the replacement rules run, so a
    # rule that maps trailing punctuation to a space can leave whitespace at
    # the end of the result -- confirm whether callers want a final strip.
    cleaned = txt.strip().lower()
    return text_replace(sbc_to_dbc(cleaned))
# 网友评论