1 赛事背景
问答系统中包括三个主要的部分:问题理解,信息检索和答案抽取。而问题理解是问答系统的第一部分也是非常关键的一部分。问题理解有非常广泛的应用,如重复评论识别、相似问题识别等。
重复问题检测是一个常见的文本挖掘任务,在很多实际问答社区都有相应的应用。重复问题检测可以方便进行问题的答案聚合,以及问题答案推荐,自动QA等。由于中文词语的多样性和灵活性,本赛题需要选手构建一个重复问题识别算法。
2 赛事任务
本次赛题希望参赛选手对两个问题完成相似度打分。
训练集:约5千条问题对和标签。若两个问题是相同的问题,标签为1;否则为0。
测试集:约5千条问题对,需要选手预测标签。
3 评审规则
1. 数据说明
训练集给定问题对和标签,使用\t进行分隔。测试集给定问题对,使用\t进行分隔。
eg:世界上什么东西最恐怖 世界上最恐怖的东西是什么? 1
解析:“世界上什么东西最恐怖”与“世界上最恐怖的东西是什么”是相同的问题,故为重复问题,标签为1。
2. 评估指标
本次竞赛的评价标准采用准确率指标,最高分为1。计算方法参考https://scikit-learn.org/stable/modules/generated/sklearn.metrics.accuracy_score.html,评估代码参考:
from sklearn.metrics import accuracy_score
# Toy example: the prediction agrees with the ground truth at positions 0 and 3 only.
y_pred = [0, 2, 1, 3]
y_true = [0, 1, 2, 3]
# Accuracy is the fraction of exactly matching labels: 2 of 4 here, i.e. 0.5.
accuracy_score(y_true, y_pred)
4 特征工程
1 基础特征
# Text-length features (values are cast to str before being measured).
data['q1_len'] = data['q1'].astype(str).apply(len)
data['q2_len'] = data['q2'].astype(str).apply(len)

# Length comparison: signed gap, absolute gap, and the ratio in both directions.
len_gap = data['q1_len'].sub(data['q2_len'])
data['q1q2_len_diff'] = len_gap
data['q1q2_len_diff_abs'] = len_gap.abs()
data['q1q2_rate'] = data['q1_len'].div(data['q2_len'])
data['q2q1_rate'] = data['q2_len'].div(data['q1_len'])

# Trailing-punctuation flags: 1 when the question ends with '?', otherwise 0.
q1_ends_with_mark = data['q1'].str.endswith('?')
q2_ends_with_mark = data['q2'].str.endswith('?')
data['q1_end_special'] = q1_ends_with_mark.astype(int)
data['q2_end_special'] = q2_ends_with_mark.astype(int)
2 共现字特征
# Number of distinct characters that occur in both questions of a pair.
data['comm_q1q2char_nums'] = [
    len(set(q1_text).intersection(set(q2_text)))
    for q1_text, q2_text in zip(data['q1'], data['q2'])
]
# 共现字位置
def char_match_pos(q1, q2, pos_i):
    """Return where the ``pos_i``-th character of ``q1`` first occurs in ``q2``.

    Only the first 25 characters of ``q2`` are searched.

    Args:
        q1: First question text (a string or other indexable sequence).
        q2: Second question text (a string or other sliceable sequence).
        pos_i: Zero-based index of the character of ``q1`` to look up.

    Returns:
        int: The 1-based position of the first match inside ``q2``;
        ``0`` when the character does not occur in the searched prefix
        (including when ``q2`` is empty); ``-1`` when ``q1`` has no
        character at index ``pos_i``.
    """
    if pos_i >= len(q1):
        return -1  # q1 is too short to have a character at this index

    target = q1[pos_i]
    # Only the first 25 characters of q2 take part in the match.
    for pos_j, candidate in enumerate(q2[:25]):
        if candidate == target:
            return pos_j + 1  # positions are reported 1-based
    # No match. This also covers an empty q2: previously the search loop did
    # not run at all for it and the result variable was never assigned.
    return 0
# One feature per leading character of q1 (the first 8 positions): where that
# character is matched in q2, as reported by char_match_pos.
for pos_i in range(8):
    feature_name = 'q1_pos_' + str(pos_i + 1)
    match_positions = data.apply(
        lambda row: char_match_pos(row['q1'], row['q2'], pos_i),
        axis=1,
    )
    data[feature_name] = match_positions.astype(np.int8)
这里也可以先用结巴(jieba)分词,把上述“字”粒度的特征改成“词”粒度。
3 距离特征
print("===========距离特征 =============")
# Pairwise text measures, keyed by the feature-name prefix they produce.
sim_func_dict = {
    "jaccard": distance.jaccard,
    "sorensen": distance.sorensen,
    "levenshtein": distance.levenshtein,
    "ratio": Levenshtein.ratio,
}
for sim_func in tqdm(sim_func_dict, desc="距离特征"):
    measure = sim_func_dict[sim_func]

    # Feature computed on the complete question pair.
    data[sim_func] = data.apply(
        lambda row: measure(row["q1"], row["q2"]), axis=1)

    # Truncated variants: [prefix length of q1, prefix length of q2].
    qt = [[3, 3], [3, 5], [5, 5], [5, 10], [10, 10], [10, 15], [15, 15], [15, 25]]
    for qt_len in qt:
        # The 3-character q1 prefixes are skipped for the levenshtein measure.
        if qt_len[0] == 3 and sim_func == "levenshtein":
            continue
        q_chars, t_chars = qt_len
        column = sim_func + '_q' + str(q_chars) + '_t' + str(t_chars)
        data[column] = data.apply(
            lambda row: measure(row["q1"][:q_chars], row["q2"][:t_chars]),
            axis=1)
4 文本向量匹配特征
from scipy.spatial.distance import cosine, cityblock, canberra, euclidean, \
minkowski, braycurtis, correlation, chebyshev, jensenshannon, mahalanobis, \
seuclidean, sqeuclidean
from tqdm import tqdm
tqdm.pandas()
# 计算词向量的相似度
# Selector -> distance function for get_w2v. Selectors 10 (mahalanobis) and
# 11 (seuclidean) are deliberately absent; get_w2v handles them explicitly.
_W2V_MEASURES = {
    1: cosine,
    2: lambda u, v: canberra(u, v) / len(u),   # scaled by the vector length
    3: lambda u, v: cityblock(u, v) / len(u),  # scaled by the vector length
    4: euclidean,
    5: braycurtis,
    6: minkowski,
    7: correlation,
    8: chebyshev,
    9: jensenshannon,
    12: sqeuclidean,
}


def _mean_word_vector(tokens, wv, dim):
    """Average the vectors of the tokens found in ``wv``; zeros if none is found."""
    total = np.zeros(dim)
    hits = 0
    for token in tokens:
        if token in wv:
            total += wv[token]
            hits += 1
    if hits:
        total = total / hits
    return total.tolist()


def get_w2v(query, title, num, model=None, dim=100):
    """Compute one distance between the mean word vectors of two token lists.

    Each token list is reduced to the average of the vectors of its
    in-vocabulary tokens. A list without any in-vocabulary token is
    represented by the zero vector (previously the zero-count guard was
    overwritten by an unconditional division, which produced NaN vectors).

    Args:
        query: Iterable of tokens of the first text.
        title: Iterable of tokens of the second text.
        num: Measure selector: 1 cosine, 2 canberra / dim, 3 cityblock / dim,
            4 euclidean, 5 braycurtis, 6 minkowski, 7 correlation,
            8 chebyshev, 9 jensenshannon, 10 mahalanobis, 11 seuclidean,
            12 sqeuclidean.
        model: Object whose ``.wv`` supports ``token in wv`` and
            ``wv[token]``. Defaults to the module-level ``w2v_model``.
        dim: Dimensionality of the word vectors.

    Returns:
        The selected distance, or ``0`` when it cannot be computed.
        Selectors 10 and 11 always return ``0`` (see the note in the body).
        NOTE(review): SciPy may yield NaN for inputs on which a measure is
        undefined (e.g. cosine against a zero vector); such values are
        passed through unchanged - confirm this is acceptable downstream.

    Raises:
        ValueError: If ``num`` is not one of the selectors 1..12.
    """
    if num in (10, 11):
        # SciPy's mahalanobis needs an inverse covariance matrix and
        # seuclidean a variance vector as third argument; neither is
        # available here. The original two-argument calls therefore always
        # raised and were converted to 0 - keep that result, but explicitly.
        return 0
    if num not in _W2V_MEASURES:
        raise ValueError("unknown distance selector: {!r}".format(num))

    wv = (w2v_model if model is None else model).wv
    query_vec = _mean_word_vector(query, wv, dim)
    title_vec = _mean_word_vector(title, wv, dim)

    try:
        return _W2V_MEASURES[num](query_vec, title_vec)
    except (ValueError, TypeError, ZeroDivisionError):
        # Best-effort feature: fall back to 0 instead of aborting the apply.
        return 0
# Word-vector distance features: one column per get_w2v selector, listed in
# selector order (1..12).
vec_feature_names = [
    'vec_cosine',
    'vec_canberra',
    'vec_cityblock',
    'vec_euclidean',
    'vec_braycurtis',
    'vec_minkowski',
    'vec_correlation',
    'vec_chebyshev',
    'vec_jensenshannon',
    'vec_mahalanobis',
    'vec_seuclidean',
    'vec_sqeuclidean',
]
for measure_id, feature_name in enumerate(vec_feature_names, start=1):
    data[feature_name] = data.progress_apply(
        lambda row: get_w2v(row['q1_words_list'], row['q2_words_list'], measure_id),
        axis=1)

# Only these six columns are cast to float32; the remaining columns keep the
# dtype produced by the apply above.
for feature_name in ('vec_cosine', 'vec_canberra', 'vec_cityblock',
                     'vec_euclidean', 'vec_braycurtis', 'vec_correlation'):
    data[feature_name] = data[feature_name].astype('float32')
5 向量特征
def w2v_sent2vec(words, model=None, dim=100):
    """Return the L2-normalised sum of the word vectors of ``words``.

    Words missing from the vocabulary are skipped. Normalising the sum gives
    the same direction as normalising the average.

    Args:
        words: Iterable of tokens of one sentence.
        model: Object whose ``.wv[word]`` returns the word's vector and raises
            ``KeyError`` for unknown words. Defaults to the module-level
            ``w2v_model``.
        dim: Dimensionality of the word vectors; it is the length of the
            all-zero fallback vector.

    Returns:
        list[float]: The unit-length sentence vector (float32 precision), or
        ``dim`` zeros when no word is in the vocabulary or the summed vector
        has zero norm. Previously those cases produced NaN - for an
        all-unknown sentence even a single NaN scalar instead of a list,
        which breaks ``result_type='expand'`` in the caller.
    """
    wv = (w2v_model if model is None else model).wv
    vectors = []
    for word in words:
        try:
            vectors.append(wv[word])
        except KeyError:  # out-of-vocabulary word
            continue
    if not vectors:
        return [0.0] * dim

    v = np.asarray(vectors).sum(axis=0)
    norm = np.sqrt((v ** 2).sum())
    if norm == 0:
        return [0.0] * dim
    return (v / norm).astype(np.float32).tolist()
# Expand each question's sentence vector into 100 numbered columns.
for name_template, words_column in (('q1_vec_{}', 'q1_words_list'),
                                    ('q2_vec_{}', 'q2_words_list')):
    fea_names = [name_template.format(i) for i in range(100)]
    data[fea_names] = data.progress_apply(
        lambda row: w2v_sent2vec(row[words_column]),
        result_type='expand',
        axis=1)
5 模型训练
# LightGBM hyper-parameters; unpacked as keyword arguments into the model constructor.
params = {
'boosting_type': 'gbdt',
'objective': 'binary',
'num_leaves': 5,
'max_depth': 6,
'min_data_in_leaf': 450,
'learning_rate': 0.1,
'feature_fraction': 0.9,
'bagging_fraction': 0.95,
'bagging_freq': 5,
'lambda_l1': 1,
'lambda_l2': 0.001, # L2 regularization strength: LARGER values regularize more, so 0.001 is very weak
'min_gain_to_split': 0.2,
}
# Out-of-fold scores for the training rows, and the running sum of the
# per-fold predictions for the test rows.
oof = np.zeros(len(X))
prediction = np.zeros(len(X_test))
feature_frame = X[features]
for fold_n, (train_index, valid_index) in enumerate(folds.split(X)):
    X_train = feature_frame.iloc[train_index]
    X_valid = feature_frame.iloc[valid_index]
    y_train = y[train_index]
    y_valid = y[valid_index]

    # NOTE(review): a regressor class is combined with objective='binary';
    # its raw outputs are later thresholded at 0.5.
    model = lgb.LGBMRegressor(**params, n_estimators=50000, n_jobs=-1)
    # NOTE(review): whether fit() accepts `verbose` / `early_stopping_rounds`
    # depends on the installed LightGBM version - confirm.
    model.fit(
        X_train,
        y_train,
        eval_set=[(X_train, y_train), (X_valid, y_valid)],
        eval_metric='binary_logloss',
        verbose=50,
        early_stopping_rounds=200,
    )

    # Score the held-out fold and keep it at the matching training positions.
    y_pred_valid = model.predict(X_valid)
    oof[valid_index] = y_pred_valid.reshape(-1)

    # NOTE(review): X_test is passed as-is, not restricted to `features` -
    # verify it has the same columns as the training frame.
    y_pred = model.predict(X_test, num_iteration=model.best_iteration_)
    prediction += y_pred

# Turn the accumulated sum into the average over folds.
prediction /= n_fold
线下分数(准确率)的计算方式如下:
from sklearn.metrics import accuracy_score

# Turn the out-of-fold scores into hard labels with a 0.5 threshold.
y_pred = oof > 0.5
# Accuracy only counts matching positions, so the argument order does not
# change the value; (y_true, y_pred) is the documented order.
score = accuracy_score(train['label'].values, y_pred)
score
线下准确率为0.839,线上为0.8406,线上和线下比较吻合。
网友评论