def _cut_spans(text, spans):
    """Return *text* with every ``(start, end)`` slice in *spans* removed.

    *spans* must be non-overlapping and ordered by ascending start offset.
    Adjacent spans need no special handling: the slice kept between them is
    simply empty.
    """
    kept = []
    cursor = 0
    for start, end in spans:
        kept.append(text[cursor:start])
        cursor = end
    kept.append(text[cursor:])
    return ''.join(kept)


# For each name in df['name']: tokenize it with jieba and delete the LAST
# occurrence of every token that appears more than once in that name.
# ASCII parentheses are exempt, so repeated '(' / ')' are left in place.
# The cleaned names are collected, in order, in `ls`.
ls = []
for word in df['name'].values:
    # Each item produced by jieba.tokenize is indexed below as
    # (token, start offset, end offset); the offsets are used to slice `word`.
    data1 = pd.DataFrame(
        list(jieba.tokenize(word)),
        columns=['wd', 'start_index', 'end_index'],
    )

    # Rows whose token occurs more than once in this name ...
    repeated = data1.duplicated(subset=['wd'], keep=False)
    # ... and rows that are NOT the final occurrence of their token.
    # keep='last' is essential here: it is what makes the final occurrence
    # (rather than the first) the one selected for removal.
    not_last = data1.duplicated(subset=['wd'], keep='last')
    # Final occurrence of every repeated token. This replaces the former
    # DataFrame.append(...) round-trip, which no longer exists in pandas 2.x.
    dup_df = data1[repeated & ~not_last]
    # Repeated parentheses are not treated as duplicates.
    dup_df = dup_df[~dup_df['wd'].isin(['(', ')'])]
    dup_df = dup_df.reset_index()
    # print(dup_df)  # show the positions of the duplicated tokens

    # Rows are still in token order, so the spans are ascending and can be cut
    # out in one pass regardless of how many there are. Previously only 0-3
    # duplicates were handled; with 4 or more, `x` was never assigned and the
    # previous name's result was appended instead.
    x = _cut_spans(word, zip(dup_df['start_index'], dup_df['end_index']))

    # Diagnostics kept from the original script for heavily duplicated names.
    if len(dup_df) == 3:
        print('-------------------------------------------------------------------------------重复值为三个:\n', dup_df)
    elif len(dup_df) > 3:
        print('-------------------------------------------------------------------------------重复值为四个及以上:\n', dup_df)

    ls.append(x)
len(ls)
# 网友评论 (reader comments -- web-page text, not part of the script)