加载单位数据(例如“支”“盒”等),请根据自己的业务数据自行整理:
# Load the unit words (the accompanying text gives 支 and 盒 as examples) from
# the workbook "unit.xlsx".
# NOTE(review): `pd` is not imported in the visible code — presumably pandas
# imported as `pd` elsewhere; confirm.
unit_data=pd.read_excel("unit.xlsx")
# Report how many entries were loaded.
print("单位数据:",len(unit_data))
# Set built from the `name` attribute of the loaded data; char_test() checks
# membership in it. Presumably `name` is a column of unit strings — verify
# against the workbook.
unit_set = set(unit_data.name)
定义关键字,用于特殊处理:
# Filler keywords that may sit between the unit and the quantity
# (e.g. "每盒装10片"); specification_normalize() skips one of them.
key_set = {"装", "重"}
根据业务规则判断某个字符是否属于连续的规格内容(数字、英文字母、“.”、“/”或单位字):
def is_chinese(c):
    """Return True when *c* falls inside the range U+4E00..U+9FFF.

    The check is a plain string comparison against the two bounds, so it is
    meant to be called with a single character.
    """
    lower_bound, upper_bound = '\u4e00', '\u9fff'
    return lower_bound <= c and c <= upper_bound
def char_test(c):
    """Return True when *c* may be part of a quantity expression.

    Accepted characters are digits, ASCII letters (A-Z, a-z), ".", "/" and
    any entry of the module-level ``unit_set``. The unit lookup is performed
    last, only when none of the cheaper checks matched.
    """
    if c.isdigit():
        return True
    if ('A' <= c <= 'Z') or ('a' <= c <= 'z'):
        return True
    if c in ('.', '/'):
        return True
    return c in unit_set
标准化实现:
def specification_extract(txt):
    """Split *txt* into consecutive segments at every occurrence of "每".

    Each "每" starts a new segment. Any text before the first "每" becomes a
    leading segment of its own. When *txt* contains no "每" the result is a
    one-element list holding *txt* unchanged. Joining the returned segments
    reproduces *txt*.
    """
    marker = "每"

    # Collect the position of every marker, left to right.
    cut_points = []
    pos = txt.find(marker)
    while pos != -1:
        cut_points.append(pos)
        pos = txt.find(marker, pos + 1)

    if not cut_points:
        return [txt]

    # Keep whatever precedes the first marker as its own segment.
    if cut_points[0] > 0:
        cut_points.insert(0, 0)

    bounds = cut_points + [len(txt)]
    return [txt[lo:hi] for lo, hi in zip(bounds, bounds[1:])]
def specification_normalize(txt):
    """Rewrite one "每<unit><quantity>" clause as "<quantity>/<unit>".

    The text is scanned from the first "每":

    * if the next character is Chinese (``is_chinese``) it is taken as the
      unit, and one following filler keyword from ``key_set`` is skipped;
    * the quantity is the longest run of characters accepted by
      ``char_test``;
    * everything after the quantity is appended unchanged.

    Text in front of "每" is not part of the result. Strings without "每"
    are returned unchanged. When no unit or no quantity is present, the
    corresponding part of the result is empty (e.g. "每" alone yields "/").

    Args:
        txt: A string, normally one segment from ``specification_extract``.

    Returns:
        The normalized string.
    """
    marker_pos = txt.find("每")
    if marker_pos < 0:
        return txt

    length = len(txt)
    pos = marker_pos + 1
    unit_txt = ""

    # Bounds are checked before indexing: the marker (or the unit) may be the
    # last character of the string, which previously raised IndexError.
    if pos < length and is_chinese(txt[pos]):
        unit_txt = txt[pos]
        pos += 1
        # Skip a filler keyword sitting between the unit and the quantity.
        if pos < length and txt[pos] in key_set:
            pos += 1

    # Extend the quantity while characters are accepted by char_test.
    end = pos
    while end < length and char_test(txt[end]):
        end += 1

    # Always keep the remainder. The previous `end < len(txt) - 1` guard
    # silently dropped the final character when exactly one was left.
    return txt[pos:end] + "/" + unit_txt + txt[end:]
def specification_all_normalize(txt):
    """Normalize every "每..." clause contained in *txt*.

    Text without "每" is returned unchanged. Otherwise the text is split by
    ``specification_extract``, each segment is passed through
    ``specification_normalize``, and the results are concatenated.
    """
    if txt.find("每") == -1:
        return txt
    segments = specification_extract(txt)
    return "".join(specification_normalize(segment) for segment in segments)
网友评论