Lucene + mmseg: Implementing a Stopword-Filtering Analyzer

Author: 明丶夷 | Published 2018-09-30 15:42

    The Lucene version used throughout this article is 5.5.3; this will not be restated below.


    When I previously used Lucene to implement the system's full-text search, I chose a third-party analyzer called monad-mmseg, a Chinese tokenizer based on the MMSeg algorithm that appears to be no longer maintained. Since this was my first time developing full-text search, some cases were inevitably overlooked: during functional testing I found that words such as 你 (you), 我 (I), and 的 (of) were also matched, so a lot of meaningless results came back. After consulting some references, I learned that these meaningless words have a dedicated name, stop words, and that configuring a stopword list filters them out and improves search relevance.
    I will not cover the mmseg analyzer in detail; it is enough to know that it provides no stopword filtering of its own. Lucene's built-in StopAnalyzer, on the other hand, does not meet the needs of Chinese tokenization. Here is the StopAnalyzer source:

    public final class StopAnalyzer extends StopwordAnalyzerBase {

        // The default English stopword set, built once in the static initializer below.
        public static final CharArraySet ENGLISH_STOP_WORDS_SET;

        static {
            final List<String> stopWords = Arrays.asList(
                "a", "an", "and", "are", "as", "at", "be", "but", "by", "for",
                "if", "in", "into", "is", "it", "no", "not", "of", "on", "or",
                "such", "that", "the", "their", "then", "there", "these", "they", "this", "to",
                "was", "will", "with");
            final CharArraySet stopSet = new CharArraySet(stopWords, false);
            ENGLISH_STOP_WORDS_SET = CharArraySet.unmodifiableSet(stopSet);
        }

        // Uses the default English stop words.
        public StopAnalyzer() {
            this(ENGLISH_STOP_WORDS_SET);
        }

        public StopAnalyzer(CharArraySet stopWords) {
            super(stopWords);
        }

        public StopAnalyzer(Path stopwordsFile) throws IOException {
            this(loadStopwordSet(stopwordsFile));
        }

        public StopAnalyzer(Reader stopwords) throws IOException {
            this(loadStopwordSet(stopwords));
        }

        @Override
        protected TokenStreamComponents createComponents(String fieldName) {
            // Split on non-letters and lowercase, then drop any token found in `stopwords`
            // (the protected field inherited from StopwordAnalyzerBase).
            final Tokenizer source = new LowerCaseTokenizer();
            return new TokenStreamComponents(source, new StopFilter(source, stopwords));
        }
    }
    

    It is plain to see that StopAnalyzer's stop words are defined as the stopWords list in a static initializer, although it also provides constructors that load stop words from other sources. The tokenizer it uses is LowerCaseTokenizer, and when building the TokenStreamComponents it passes a StopFilter alongside that tokenizer. Modeled on StopAnalyzer, we can implement our own stopword-filtering analyzer.
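
    To make the StopFilter's effect concrete, here is a minimal sketch (not from the original article) that runs the default StopAnalyzer over a short English sentence and prints the surviving tokens; the field name "f" is an arbitrary placeholder:

    import java.io.IOException;

    import org.apache.lucene.analysis.Analyzer;
    import org.apache.lucene.analysis.TokenStream;
    import org.apache.lucene.analysis.core.StopAnalyzer;
    import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

    public class StopAnalyzerDemo {
        public static void main(String[] args) throws IOException {
            try (Analyzer analyzer = new StopAnalyzer();
                 TokenStream ts = analyzer.tokenStream("f", "this is a test of the stop filter")) {
                CharTermAttribute term = ts.addAttribute(CharTermAttribute.class);
                ts.reset();
                while (ts.incrementToken()) {
                    // "this", "is", "a", "of", "the" are filtered out;
                    // prints: test, stop, filter
                    System.out.println(term.toString());
                }
                ts.end();
            }
        }
    }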


    The overall approach: create our own analyzer class (named OwnStopAnalyzer in this article), extend Lucene's Analyzer class, and implement the createComponents method. As the StopAnalyzer code above shows, createComponents(String fieldName) is the core; the tokenizer and the filters are both decided inside this method. Here it is in detail:

    protected TokenStreamComponents createComponents(String fieldName) {
        MMSegTokenizer ownStopTokenizer = new MMSegTokenizer(newSeg());
        StopFilter stopFilter = new StopFilter(ownStopTokenizer, stopWordsSet);
        return new TokenStreamComponents(ownStopTokenizer, stopFilter);
    }
    

    The tokenizer used here is MMSegTokenizer; the stopword set handed to the filter is stopWordsSet, of type CharArraySet, whose initialization appears in the complete code below.
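
    As a quick aside before the full class: a CharArraySet can also be built directly in code rather than loaded from a file. A minimal sketch (the words below are illustrative placeholders, not the article's actual stopword list):

    import java.util.Arrays;

    import org.apache.lucene.analysis.util.CharArraySet;

    public class StopWordsSample {
        // A tiny hand-built stopword set; the second argument enables case-insensitive matching.
        static final CharArraySet SAMPLE_STOP_WORDS = CharArraySet.unmodifiableSet(
                new CharArraySet(Arrays.asList("的", "了", "你", "我", "是"), true));
    }
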
    The complete code for OwnStopAnalyzer:

    import java.io.File;
    import java.io.IOException;
    import java.io.InputStream;
    import java.io.Reader;
    import java.nio.charset.StandardCharsets;
    import java.nio.file.Files;
    import java.nio.file.Path;

    import org.apache.commons.io.FileUtils;
    import org.apache.lucene.analysis.Analyzer;
    import org.apache.lucene.analysis.core.StopFilter;
    import org.apache.lucene.analysis.util.CharArraySet;
    import org.apache.lucene.analysis.util.WordlistLoader;
    import org.apache.lucene.util.IOUtils;
    import org.slf4j.Logger;
    import org.slf4j.LoggerFactory;
    import org.springframework.core.io.Resource;
    import org.springframework.core.io.support.PathMatchingResourcePatternResolver;

    import monad.mmseg.Dictionary;
    import monad.mmseg.MaxWordSeg;
    import monad.mmseg.Seg;
    import monad.mmseg.analysis.MMSegTokenizer;


    public class OwnStopAnalyzer extends Analyzer {

        private static final Logger logger = LoggerFactory.getLogger(OwnStopAnalyzer.class);

        protected Dictionary dic;

        private CharArraySet stopWordsSet;

        public OwnStopAnalyzer() {
            dic = Dictionary.getInstance();
        }

        /**
         * @param path        the custom mmseg segmentation dictionary
         * @param contentPath the directory holding the custom stopword files
         * @throws IOException
         */
        public OwnStopAnalyzer(File path, String contentPath) throws IOException {
            this.stopWordsSet = loadStopwordSet(copyStopWords(contentPath));
            this.dic = Dictionary.getInstance(path);
        }

        protected Seg newSeg() {
            return new MaxWordSeg(dic);
        }

        public Dictionary getDict() {
            return dic;
        }

        @Override
        protected TokenStreamComponents createComponents(String fieldName) {
            // Tokenize with mmseg, then strip stop words with a StopFilter.
            MMSegTokenizer ownStopTokenizer = new MMSegTokenizer(newSeg());
            StopFilter stopFilter = new StopFilter(ownStopTokenizer, stopWordsSet);
            return new TokenStreamComponents(ownStopTokenizer, stopFilter);
        }

        // Read the one-word-per-line entries of the given file into a CharArraySet.
        private CharArraySet loadStopwordSet(Path stopwords) throws IOException {
            logger.info("try to load " + stopwords.toString());
            Reader reader = Files.newBufferedReader(stopwords, StandardCharsets.UTF_8);
            try {
                return WordlistLoader.getWordSet(reader);
            } finally {
                IOUtils.close(reader);
            }
        }

        // Copy the stopword files bundled on the classpath into a local directory,
        // because resources inside a jar cannot be addressed as plain files.
        private Path copyStopWords(String contentPath) throws IOException {
            PathMatchingResourcePatternResolver resolver = new PathMatchingResourcePatternResolver();
            File folderFile = new File(contentPath + "/analyzerData/stop");
            if (!folderFile.exists()) {
                logger.info("creating local stopword directory");
                folderFile.mkdirs();
            }
            Resource[] resources = resolver.getResources("analyzerData/stop/*.*");
            for (Resource resource : resources) {
                File copyFile = new File(contentPath + "/analyzerData/stop/" + resource.getFilename());
                if (copyFile.exists()) {
                    continue;
                }
                // Inside a jar a resource has no usable file path, but its stream is always readable.
                InputStream stream = resource.getInputStream();
                FileUtils.copyInputStreamToFile(stream, copyFile);
                logger.info("copied bundled stopword file to local directory");
            }
            File stopWords = new File(contentPath + "/analyzerData/stop/" + "stopwords.dic");
            return stopWords.toPath();
        }
    }
    

    loadStopwordSet(Path stopwords) and copyStopWords(String contentPath) together form the stopword-loading process. copyStopWords(String contentPath) exists because the application is ultimately packaged and run as a jar, and resource files inside a jar cannot be read via a file path; it is one workaround for that. If you can obtain the resource files' paths directly, it can be dropped. The stopword resource files follow mmseg's naming convention: the *.dic format.
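
    As a closing illustration, here is a hedged usage sketch, not from the original article: the dictionary directory, content path, and sample sentence are all placeholder assumptions. It runs a Chinese sentence through OwnStopAnalyzer and prints the tokens that survive the stop filter:

    import java.io.File;
    import java.io.IOException;

    import org.apache.lucene.analysis.TokenStream;
    import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

    public class OwnStopAnalyzerDemo {
        public static void main(String[] args) throws IOException {
            // Placeholder paths: point these at your mmseg dictionary directory
            // and at the writable directory the stopword files are copied into.
            File dicPath = new File("/data/mmseg/dic");
            String contentPath = "/data/search";

            try (OwnStopAnalyzer analyzer = new OwnStopAnalyzer(dicPath, contentPath);
                 TokenStream ts = analyzer.tokenStream("content", "我觉得这个搜索功能很好用")) {
                CharTermAttribute term = ts.addAttribute(CharTermAttribute.class);
                ts.reset();
                while (ts.incrementToken()) {
                    // With entries such as 我, 觉得, 这个 listed in stopwords.dic,
                    // only the content-bearing terms should remain.
                    System.out.println(term.toString());
                }
                ts.end();
            }
        }
    }

    stopwords.dic itself is a plain UTF-8 text file with one stop word per line, which is the format WordlistLoader.getWordSet expects.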
