1.背景
敏感词过滤是一个网站必不可少的功能,特别是用户能自由发言的需求。最近刚好有这样的需求,所以参考大家的方法有下面的实现,希望对大家有帮助
2.实现
2.1定义敏感词接口
import java.util.Collection;
import java.util.Optional;
import java.util.Set;
/**
* 敏感词接口
*
* @author lieber
*/
public interface ISensitiveWords {
/**
* 初始化敏感词库
*
* @param keywords 所有敏感词
*/
void init(Collection<String> keywords);
/**
* 添加敏感词
*
* @param keyword 待添加的敏感词
*/
void add(String keyword);
/**
* 检查敏感词
*
* @param words 待检查文本
* @return 是否包含敏感词
*/
boolean contain(String words);
/**
* 检查第一个敏感词
*
* @param words 待检查文本
* @return 检查到的第一个敏感词
*/
Optional<String> first(String words);
/**
* 所有包含的敏感词
*
* @param words 待检查文本
* @return 敏感词集合
*/
Set<String> all(String words);
}
2.2 基于DFA算法的实现
import com.yugioh.core.util.words.sensitive.ISensitiveWords;
import java.util.*;
/**
* 敏感词处理 -- DFA算法
*
* @author lieber
*/
public class DfaSensitiveWords implements ISensitiveWords {
private final static String END_MARK_KEY = "end";
private final static int MIN_MATCH_FLAG = 2;
private HashMap wordsMap;
@Override
public void init(Collection<String> keywords) {
if (keywords == null || keywords.isEmpty()) {
throw new IllegalArgumentException("The param of keywords cannot be empty");
}
wordsMap= new HashMap(keywords.size());
for (String keyword : keywords) {
if (keyword == null) {
continue;
}
Map nowMap = wordsMap;
char[] chars = keyword.toCharArray();
for (int i = 0; i < chars.length; i++) {
char keyChar = chars[i];
Object wordMap = nowMap.get(keyChar);
if (wordMap != null) {
//如果存在该key,直接赋值
nowMap = (Map) wordMap;
} else {
//不存在则,则构建一个map,同时将end设置为false,因为他不是最后一个
Map<String, Object> newWorMap = new HashMap<>(4);
nowMap.put(keyChar, newWorMap);
nowMap = newWorMap;
}
if (i == chars.length - 1) {
//最后一个
nowMap.put(END_MARK_KEY, true);
}
}
}
}
@Override
public void add(String keyword) {
if (wordsMap== null) {
throw new IllegalArgumentException("Please initialize first");
}
}
@Override
public boolean contain(String words) {
for (int i = 0; i < words.length(); i++) {
int matchFlag = this.check(words, i);
if (matchFlag > 0) {
return true;
}
}
return false;
}
@Override
public Optional<String> first(String words) {
for (int i = 0; i < words.length(); i++) {
int length = this.check(words, i);
if (length > 0) {
return Optional.of(words.substring(i, i + length));
}
}
return Optional.empty();
}
@Override
public Set<String> all(String words) {
Set<String> sensitiveWordList = new HashSet<>();
for (int i = 0; i < words.length(); i++) {
int length = check(words, i);
if (length > 0) {
sensitiveWordList.add(words.substring(i, i + length));
i = i + length - 1;
}
}
return sensitiveWordList;
}
/**
* 检查文字中是否包含敏感字符
*
* @param words 文本
* @param beginIndex 开始位置
* @return 如果存在,则返回敏感词字符的长度,不存在返回0
*/
private int check(String words, int beginIndex) {
boolean flag = false;
int matchFlag = 0;
Map nowMap = wordsMap;
for (int i = beginIndex; i < words.length(); i++) {
char word = words.charAt(i);
nowMap = (Map) nowMap.get(word);
if (nowMap == null) {
break;
}
matchFlag++;
flag = Boolean.TRUE.equals(nowMap.get(END_MARK_KEY));
}
return matchFlag >= MIN_MATCH_FLAG && flag ? matchFlag : 0;
}
}
网友评论