import regex as re
# Value of every single character that can stand for one digit.  Besides the
# everyday and the financial ("capital") numerals, 两 counts as 2 and the
# weekday words 天 / 日 / 末 count as 7 (星期天, 周日, 周末 -> day 7).
_DIGIT_VALUES = {
    '零': 0, '0': 0,
    '一': 1, '1': 1, '壹': 1,
    '二': 2, '2': 2, '贰': 2, '两': 2,
    '三': 3, '3': 3, '叁': 3,
    '四': 4, '4': 4, '肆': 4,
    '五': 5, '5': 5, '伍': 5,
    '六': 6, '6': 6, '陆': 6,
    '七': 7, '7': 7, '柒': 7, '天': 7, '日': 7, '末': 7,
    '八': 8, '8': 8, '捌': 8,
    '九': 9, '9': 9, '玖': 9,
}

# Colloquial shorthand "<digit><unit><digit>" where the trailing digit means
# one unit lower (一万二 = 12000, 三千五 = 3500, 四百五 = 450).  The negative
# look-ahead skips fully spelled-out numbers such as 一万二千.
_SHORTHAND_PASSES = (
    (re.compile('[一二两三四五六七八九123456789]万'
                '[一二两三四五六七八九123456789](?![千百十])'), '万', 10000),
    (re.compile('[一二两三四五六七八九123456789]千'
                '[一二两三四五六七八九123456789](?![百十])'), '千', 1000),
    (re.compile('[一二两三四五六七八九123456789]百'
                '[一二两三四五六七八九123456789](?!十)'), '百', 100),
)

# A single Chinese numeral character.
_SINGLE_DIGIT = re.compile('[零一二两三四五六七八九]')

# 天 / 日 / 末 directly after 周, 星期, 天 or 日.  The look-behind is split into
# fixed-width alternatives so that it compiles with the standard-library
# ``re`` module as well as with ``regex``.
_WEEKDAY = re.compile('(?:(?<=[周天日])|(?<=星期))[天日末]')

# "[tens]十[ones]" on already-converted digits, not directly after 周 / 星期.
_TENS = re.compile('(?<!周)(?<!星期)0?[0-9]?十[0-9]?')

# "<digits><unit><digits>" folding, from the smallest unit to the largest so
# that the results of one pass feed the next.
_UNIT_PASSES = (
    (re.compile('0?[1-9]百[0-9]{0,2}'), '百', 100),
    (re.compile('0?[1-9]千[0-9]{0,3}'), '千', 1000),
    (re.compile('[0-9]+万[0-9]{0,4}'), '万', 10000),
    (re.compile('[0-9]+亿[0-9]{0,8}'), '亿', 100000000),
)


def _word2number(s):
    """Return the value 0-9 of a single numeral character, or -1 if unknown."""
    return _DIGIT_VALUES.get(s, -1)


def _str2int(s):
    """Parse *s* as an int; an empty or non-numeric string counts as 0."""
    try:
        return int(s)
    except ValueError:
        return 0


def _shorthand_value(group, unit, scale):
    """Value of "<digit><unit><digit>" where the last digit is one unit lower."""
    head, _, tail = group.partition(unit)
    return _word2number(head) * scale + _word2number(tail) * (scale // 10)


def _tens_value(group):
    """Value of "[tens]十[ones]"; a missing or zero tens digit means 1 (十五 = 15)."""
    tens_text, _, ones_text = group.partition('十')
    tens = _str2int(tens_text) or 1
    return tens * 10 + _str2int(ones_text)


def _unit_value(group, unit, scale):
    """Value of "<digits><unit>[digits]", e.g. "2千33" -> 2033."""
    head, _, tail = group.partition(unit)
    return int(head) * scale + (int(tail) if tail else 0)


def _replace_in_order(pattern, text, convert, *args):
    """Replace the matches of *pattern* in *text* one at a time, left to right.

    The matches are collected from the text as it was on entry; each
    replacement is then applied to the first match still present in the
    partially rewritten text.  This re-scan is intentional: a replacement may
    remove the context a later match depended on (in "星期天天气" only the
    first 天 is a weekday, giving "星期7天气").
    """
    result = text
    for found in pattern.finditer(text):
        replacement = str(convert(found.group(), *args))
        result = pattern.sub(replacement, result, count=1)
    return result


def number_translator(target):
    """Rewrite every Chinese-character number in *target* as Arabic digits.

    Examples::

        "一千两百,六百零五"  -> "1200,605"
        "两万零六百五"       -> "20650"    (colloquial shorthand)
        "两百一十四", "两百十四" -> "214"
        "一六零加一五八"     -> "160加158" (digit-by-digit reading)
        "星期天", "周末"     -> "星期7", "周7"

    The conversion is a fixed sequence of regex passes: colloquial shorthand
    (一万二), single numeral characters, weekday words, 十, and finally the
    units 百, 千, 万 and 亿 from smallest to largest.  According to the
    original author the reliably supported range is 0-99999999.

    :param target: the string to convert
    :return: the converted string; text that is not a number is left as is
    """
    for pattern, unit, scale in _SHORTHAND_PASSES:
        target = _replace_in_order(pattern, target, _shorthand_value, unit, scale)

    target = _replace_in_order(_SINGLE_DIGIT, target, _word2number)
    target = _replace_in_order(_WEEKDAY, target, _word2number)
    target = _replace_in_order(_TENS, target, _tens_value)

    for pattern, unit, scale in _UNIT_PASSES:
        target = _replace_in_order(pattern, target, _unit_value, unit, scale)

    return target
# Example:
# Demo: translate a sample sentence that mixes large numbers, day counts and
# weekday names, then print the converted string.
target = '二十亿一百万一千零两百,二千零三十三,三个星期.三天,三十日,星期天,周末, 十万亿'
print(number_translator(target))
# Output:
# 2001001200,2033,3个星期.3天,30日,星期7,周7, 10000000000000
# (The "reader comments" heading of the source web page followed here.)