用户商品信息和标准库匹配是常见的业务,下面的例子(含代码,非正式,只供参考),提供了一种根据商品的批准文号,利用标准库数据,匹配用户商品的条码的一种方法。
数据基本结构和状况
1:标准数据由于是其他方提供,为了维护方便,标准数据保持不变
2:根据标准数据因素,创建对应的自己的标准数据库表,这张表中只包含需要标准化的信息字段。根据前文 https://www.jianshu.com/p/f646dec59f5d,提到的思路进行标准化数据处理
3:条码作为主键,条码与批准文号是一对多的关系。所以在本需要中,需要找到唯一一个匹配的商品条码
匹配基本逻辑和规则
1:假如存在批准文号对应的唯一条形码,则直接匹配成功
2:假如存在多个,则分析出优先级规则,降级逐步处理,直到最后找的唯一的一个匹配条形码,或者在所有规则中不符合
以下是python代码,仅供参考
import pandas as pd
import re
import sqlalchemy
from sqlalchemy import text
replaces =[
{"src":"毫克","type":1,"dest":"mg"},
{"src":"克","type":1,"dest":"g"},
{"src":"毫升","type":1,"dest":"ml"},
{"src":"升","type":1,"dest":"l"},
{"src":"公斤","type":1,"dest":"kg"},
{"src":"。","type":1,"dest":" "},
{"src":",","type":1,"dest":" "},
{"src":"x","type":1,"dest":"*"},
{"src":"×","type":1,"dest":"*"},
{"src":" +","type":2,"dest":" "}
]
def sbc_to_dbc(ustring):
ss = []
for s in ustring:
rstring = ""
for uchar in s:
inside_code = ord(uchar)
if inside_code == 12288: # 全角空格直接转换
inside_code = 32
elif (inside_code >= 65281 and inside_code <= 65374): # 全角字符(除空格)根据关系转化
inside_code -= 65248
rstring += chr(inside_code)
ss.append(rstring)
return ''.join(ss)
def text_replace(txt):
for item in replaces:
replace_type = item["type"]
src = item["src"]
dest = item["dest"]
if replace_type == 1:
txt = txt.replace(src,dest)
elif replace_type == 2:
txt = re.sub(src,dest,txt)
return txt
def normalize(txt):
return text_replace(sbc_to_dbc(txt.strip().lower()))
def update(conn,k,row):
conn.execute(text('UPDATE goods_src_data SET standard_goods_id=:id_norm WHERE id=:id'),{"id":int(k),"id_norm":int(row.id_norm)})
conn_str = ""
src_data = pd.read_sql(sql="SELECT a.*,b.id AS id_norm,b.name AS name_norm,b.form AS form_norm,b.product_manufacturer AS product_manufacturer_norm FROM `goods_src_data` a JOIN goods_norm b ON a.approval_number=b.approval_number WHERE a.approval_number<>'' AND a.approval_number<>'-' AND standard_goods_id=0 ORDER BY a.id",con=conn_str)
print(len(src_data))
group_data = src_data.groupby("id")
print(len(group_data))
engine = sqlalchemy.create_engine(conn_str)
conn = engine.connect()
counter=0
j=0
d={}
for k,v in group_data:
if len(v)==1:
row = v.iloc[0]
#print(row)
conn.execute(text('UPDATE goods_src_data SET standard_goods_id=:id_norm WHERE id=:id'),{"id":int(k),"id_norm":int(row.id_norm)})
else:
filter_data = v[((v["name"]==v["name_norm"]) | (v.generic_name==v["name_norm"])) & (v.specification==v.form_norm) & (v.goods_manufacturer_name==v.product_manufacturer_norm)]
l = len(filter_data)
if l==1:
##print(filter_data.id,filter_data.id_norm)
r=filter_data.iloc[0]
d[r.id]=r.id_norm
update(conn,r.id,r)
counter+=1
if l!=1:
filter_data = v[((v["name"]==v["name_norm"]) | (v.generic_name==v["name_norm"])) & (v.specification==v.form_norm)]
l = len(filter_data)
if l==1:
##print(filter_data.id,filter_data.id_norm)
r=filter_data.iloc[0]
d[r.id]=r.id_norm
update(conn,r.id,r)
counter+=1
if l!=1:
filter_data = v[(v.specification==v.form_norm)]
l = len(filter_data)
if l==1:
##print(filter_data.id,filter_data.id_norm)
r=filter_data.iloc[0]
d[r.id]=r.id_norm
update(conn,r.id,r)
counter+=1
if l!=1:
filter_data = v
arr = []
for i,r in filter_data.iterrows():
if (r.specification in r.form_norm) or (r.form_norm in r.specification):
arr.append(r)
if len(arr)>1:
break
l = len(arr)
if l>0:
j+=1
if l==1:
##print(filter_data.id,filter_data.id_norm)
r=arr[0]
d[r.id]=r.id_norm
update(conn,r.id,r)
counter+=1
arr = []
for i,r in filter_data.iterrows():
if r.specification in r.form_norm:
arr.append(r)
if len(arr)>1:
break
l = len(arr)
if l>0:
j+=1
if l==1:
##print(filter_data.id,filter_data.id_norm)
r=arr[0]
d[r.id]=r.id_norm
update(conn,r.id,r)
counter+=1
for i,r in filter_data.iterrows():
if r.form_norm in r.specification:
arr.append(r)
if len(arr)>1:
break
l = len(arr)
if l>0:
j+=1
if l==1:
##print(filter_data.id,filter_data.id_norm)
r=arr[0]
d[r.id]=r.id_norm
update(conn,r.id,r)
counter+=1
conn.close()
print("counter:",counter,"j:",j)
print(len(d))
网友评论