美文网首页elasticsearch
如何开发一个elasticsearch分词插件

如何开发一个elasticsearch分词插件

作者: tenlee | 来源:发表于2020-06-30 17:09 被阅读0次

    参考IK插件,介绍如何开发一款简单的ES分词插件。github地址:https://github.com/tenlee2012/elasticsearch-analysis-demo

    项目配置

    1. 创建pom项目

    pom文件大概如下:

    <?xml version="1.0" encoding="UTF-8"?>
    
    <project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
             xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
        <!-- Maven build for the demo analysis plugin: compiles the jar, then assembles the plugin zip. -->
        <name>elasticsearch-analysis-demo</name>
        <modelVersion>4.0.0</modelVersion>
        <groupId>org.elasticsearch</groupId>
        <artifactId>elasticsearch-analysis-demo</artifactId>
        <!-- The plugin version is tied to the Elasticsearch version it is built against. -->
        <version>${elasticsearch.version}</version>
        <packaging>jar</packaging>
        <description>Demo Custom Analyzer for Elasticsearch</description>
        <inceptionYear>2020</inceptionYear>
    
        <properties>
            <elasticsearch.version>7.7.1</elasticsearch.version>
            <!-- NOTE(review): jackson.version is not referenced anywhere in this pom; confirm it is still needed. -->
            <jackson.version>2.10.4</jackson.version>
            <maven.compiler.target>1.8</maven.compiler.target>
            <!-- The two properties below are referenced by the placeholders in plugin-descriptor.properties. -->
            <elasticsearch.plugin.name>analysis-demo</elasticsearch.plugin.name>
            <!-- NOTE(review): the sample entry class in this article is named MyAnalysisPlugin; confirm this classname matches the real entry class. -->
            <elasticsearch.plugin.classname>org.elasticsearch.plugin.analysis.demo.AnalysisDemoPlugin</elasticsearch.plugin.classname>
            <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
        </properties>
    
        <dependencies>
            <dependency>
                <groupId>org.elasticsearch</groupId>
                <artifactId>elasticsearch</artifactId>
                <version>${elasticsearch.version}</version>
               <!-- provided scope is sufficient here; the artifact is not included in the package -->
                <scope> provided </scope>
            </dependency>
           
            <dependency>
                <groupId>org.apache.logging.log4j</groupId>
                <artifactId>log4j-api</artifactId>
                <version>2.3</version>
            </dependency>
    
            <dependency>
                <groupId>junit</groupId>
                <artifactId>junit</artifactId>
                <version>4.12</version>
                <scope>test</scope>
            </dependency>
        </dependencies>
    
        <build>
            <resources>
                <resource>
                    <directory>src/main/resources</directory>
                    <filtering>false</filtering>
                    <excludes>
                        <!-- Elasticsearch plugin descriptor file; it does not need to be packaged inside the jar -->
                        <exclude>plugin-descriptor.properties</exclude>
                    </excludes>
                </resource>
            </resources>
            <plugins>
                <plugin>
                    <groupId>org.apache.maven.plugins</groupId>
                    <artifactId>maven-compiler-plugin</artifactId>
                    <version>3.7.0</version>
                    <configuration>
                        <source>1.8</source>
                        <target>1.8</target>
                    </configuration>
                </plugin>
                <!-- Repackage the built jar into a zip file; Elasticsearch plugins are distributed as zip files -->
                <plugin>
                    <groupId>org.apache.maven.plugins</groupId>
                    <artifactId>maven-assembly-plugin</artifactId>
                    <version>3.1.0</version>
                    <configuration>
                        <!-- NOTE(review): the zip is named analysis-http while the plugin name property is analysis-demo; confirm this is intended. -->
                        <finalName>analysis-http</finalName>
                        <appendAssemblyId>false</appendAssemblyId>
                        <outputDirectory>target</outputDirectory>
                        <descriptors>
                            <!-- assembly descriptor for this plugin -->
                            <descriptor>src/main/assembly/plugin.xml</descriptor>
                        </descriptors>
                    </configuration>
                    <executions>
                        <execution>
                            <!-- Run the assembly as part of the package phase so that mvn package produces the zip. -->
                            <phase>package</phase>
                            <goals>
                                <goal>single</goal>
                            </goals>
                        </execution>
                    </executions>
                </plugin>
    
                <!-- Config copy: copies the files under the config directory into target/config -->
                <plugin>
                    <groupId>org.apache.maven.plugins</groupId>
                    <artifactId>maven-resources-plugin</artifactId>
                    <version>2.4.3</version>
                    <configuration>
                        <encoding>${project.build.sourceEncoding}</encoding>
                    </configuration>
                    <executions>
                        <execution>
                            <!-- NOTE(review): this id mentions spring-boot, which this project does not use; it looks like a leftover name. -->
                            <id>copy-spring-boot-resources</id>
                            <!-- here the phase you need -->
                            <phase>validate</phase>
                            <goals>
                                <!-- resource copy configuration -->
                                <goal>copy-resources</goal>
                            </goals>
                            <configuration>
                                <encoding>utf-8</encoding>
                                <outputDirectory>${basedir}/target/config</outputDirectory>
                                <resources>
                                    <resource>
                                        <directory>${basedir}/config</directory>
                                        <includes>
                                            <include>*</include>
                                        </includes>
                                    </resource>
                                </resources>
                            </configuration>
                        </execution>
                    </executions>
                </plugin>
            </plugins>
        </build>
    </project>
    

    2. assembly 插件配置

    文件路径src/main/assembly/plugin.xml

    <?xml version="1.0"?>
    <!-- Assembly descriptor: lays out the contents of the plugin zip. -->
    <assembly>
        <id>analysis-http-release</id>
        <formats>
            <format>zip</format>
        </formats>
        <!-- Entries are placed at the root of the zip rather than under a base directory. -->
        <includeBaseDirectory>false</includeBaseDirectory>
        <fileSets>
            <!-- Ship the project's config directory as config/ inside the zip. -->
            <fileSet>
                <directory>${project.basedir}/config</directory>
                <outputDirectory>config</outputDirectory>
            </fileSet>
        </fileSets>
    
        <files>
            <!-- Both files are copied to the zip root with filtering enabled, so ${...} placeholders are substituted. -->
            <file>
                <source>${project.basedir}/src/main/resources/plugin-descriptor.properties</source>
                <outputDirectory/>
                <filtered>true</filtered>
            </file>
            <file>
                <source>${project.basedir}/src/main/resources/plugin-security.policy</source>
                <outputDirectory/>
                <filtered>true</filtered>
            </file>
        </files>
        <dependencySets>
            <!-- The project jar and its dependencies go to the zip root, except Elasticsearch itself. -->
            <dependencySet>
                <outputDirectory/>
                <useProjectArtifact>true</useProjectArtifact>
                <useTransitiveFiltering>true</useTransitiveFiltering>
                <excludes>
                    <exclude>org.elasticsearch:elasticsearch</exclude>
                </excludes>
            </dependencySet>
            <!-- NOTE(review): jackson-databind is not declared as a dependency in the pom shown in this article; confirm this set is still needed. -->
            <dependencySet>
                <outputDirectory/>
                <useProjectArtifact>true</useProjectArtifact>
                <useTransitiveFiltering>true</useTransitiveFiltering>
                <includes>
                    <include>com.fasterxml.jackson.core:jackson-databind</include>
                </includes>
                <excludes>
                    <!-- would conflict with the copy bundled with Elasticsearch -->
                    <exclude>com.fasterxml.jackson.core:jackson-core</exclude>
                </excludes>
            </dependencySet>
        </dependencySets>
    </assembly>
    

    3. 插件描述文件

    resources目录下,plugin-descriptor.properties文件。

    该描述文件是给es校验和使用,参考https://www.elastic.co/guide/en/elasticsearch/plugins/master/plugin-authors.html#_plugin_descriptor_file

    字段 类型 描述
    description String simple summary of the plugin
    version String plugin’s version
    name String the plugin name
    classname String the name of the class to load, fully-qualified.
    java.version String version of java the code is built against. Use the system property java.specification.version. Version string must be a sequence of nonnegative decimal integers separated by "."'s and may have leading zeros.
    elasticsearch.version String 对应的 Elasticsearch 版本。

    ik插件plugin-descriptor.properties的配置是从pom.xml读取的properties配置,这样维护更方便,在打包时会替换掉占位符
    如下:

    # Elasticsearch plugin descriptor file
    # This file must exist as 'plugin-descriptor.properties' at
    # the root directory of all plugins.
    #
    # The ${...} values below are placeholders: they are replaced with the
    # matching Maven project properties when the assembly copies this file
    # into the plugin zip with filtering enabled.
    #
    # NOTE(review): the 'site' / 'jvm' examples below are not among the
    # descriptor fields listed in this article (description, version, name,
    # classname, java.version, elasticsearch.version); they look like text
    # carried over from an older template - confirm before relying on them.
    #
    # A plugin can be 'site', 'jvm', or both.
    #
    ### example site plugin for "foo":
    #
    # foo.zip <-- zip file for the plugin, with this structure:
    #   _site/ <-- the contents that will be served
    #   plugin-descriptor.properties <-- example contents below:
    #
    # site=true
    # description=My cool plugin
    # version=1.0
    #
    ### example jvm plugin for "foo"
    #
    # foo.zip <-- zip file for the plugin, with this structure:
    #   <arbitrary name1>.jar <-- classes, resources, dependencies
    #   <arbitrary nameN>.jar <-- any number of jars
    #   plugin-descriptor.properties <-- example contents below:
    #
    # jvm=true
    # classname=foo.bar.BazPlugin
    # description=My cool plugin
    # version=2.0.0-rc1
    # elasticsearch.version=2.0
    # java.version=1.7
    #
    ### mandatory elements for all plugins:
    #
    # 'description': simple summary of the plugin
    description=${project.description}
    #
    # 'version': plugin's version
    version=${project.version}
    #
    # 'name': the plugin name
    name=${elasticsearch.plugin.name}
    #
    # 'classname': the name of the class to load, fully-qualified.
    classname=${elasticsearch.plugin.classname}
    #
    # 'java.version' version of java the code is built against
    # use the system property java.specification.version
    # version string must be a sequence of nonnegative decimal integers
    # separated by "."'s and may have leading zeros
    java.version=${maven.compiler.target}
    #
    # 'elasticsearch.version' version of elasticsearch compiled against
    # You will have to release a new version of the plugin for each new
    # elasticsearch release. This version is checked when the plugin
    # is loaded so Elasticsearch will refuse to start in the presence of
    # plugins with the incorrect elasticsearch.version.
    elasticsearch.version=${elasticsearch.version}
    

    4. 权限声明文件

    resources目录下,文件名为plugin-security.policy

    jdk的安全策略限制,必须声明项目使用的权限

    // Permissions requested by this plugin. Under the JDK security policy,
    // every permission the plugin code uses has to be declared here.
    grant {
      // needed because of the hot reload functionality
      // NOTE(review): no hot-reload code is shown in this demo; the line above looks carried over from the IK plugin - confirm.
      permission java.net.SocketPermission "*", "accept,connect,resolve"; // network access
      permission java.lang.RuntimePermission "getClassLoader"; // needed by some libraries, e.g. okhttp, fastjson
      permission java.net.NetPermission "getProxySelector"; // network access
      permission java.lang.RuntimePermission "accessDeclaredMembers"; // serialization and deserialization, e.g. jackson, fastjson
      permission java.lang.reflect.ReflectPermission "suppressAccessChecks"; // serialization and deserialization, e.g. jackson, fastjson
    };
    

    开发

    1. 插件入口类

    plugin-descriptor.properties文件的classname属性配置的插件入口。
    ik的入口类是org.elasticsearch.plugin.analysis.ik.AnalysisIkPlugin,比较简单,继承了Plugin并实现了AnalysisPlugin接口,主要声明了tokenizer和analyzer的名称,可参考ik的代码。

    /**
     * Plugin entry point: registers the demo tokenizer and the demo analyzer.
     *
     * <p>The map keys ({@code demo_tokenizer}, {@code demo_analyzer}) are the names
     * under which the components are registered.
     */
    public class MyAnalysisPlugin extends Plugin implements AnalysisPlugin {

        /**
         * Registers the tokenizers contributed by this plugin.
         *
         * @return map from tokenizer name to the provider that builds its factory
         */
        @Override
        public Map<String, AnalysisModule.AnalysisProvider<TokenizerFactory>> getTokenizers() {
            Map<String, AnalysisModule.AnalysisProvider<TokenizerFactory>> extra = new HashMap<>();
            extra.put("demo_tokenizer", MyTokenizerFactory::getTokenizerFactory);
            return extra;
        }

        /**
         * Registers the analyzers contributed by this plugin.
         *
         * @return map from analyzer name to the provider that builds it
         */
        @Override
        public Map<String, AnalysisModule.AnalysisProvider<AnalyzerProvider<? extends Analyzer>>> getAnalyzers() {
            Map<String, AnalysisModule.AnalysisProvider<AnalyzerProvider<? extends Analyzer>>> extra = new HashMap<>();
            // A typed method reference replaces the former raw AnalysisProvider that returned Object,
            // so the compiler now checks the provider's return type.
            // NOTE(review): assumes MyAnalyzerProvider.getAnalyzerProvider returns an AnalyzerProvider - confirm.
            extra.put("demo_analyzer", MyAnalyzerProvider::getAnalyzerProvider);
            return extra;
        }
    }
    

    2. tokenizer怎么写

    MyTokenizerFactory

    /**
     * Factory that creates {@link MyTokenizer} instances from one shared configuration.
     */
    public class MyTokenizerFactory extends AbstractTokenizerFactory {

        /**
         * Configuration built once in the constructor and passed to every tokenizer created.
         * NOTE(review): MyTokenizer's constructor takes a Configuration; presumably
         * MyConfiguration is a subtype of it - confirm.
         */
        private final MyConfiguration configuration;

        /**
         * Creates the factory.
         *
         * @param indexSettings index settings
         * @param name name of the analyzer or tokenizer; for a custom analyzer, the custom analyzer's name
         * @param env Elasticsearch environment
         * @param settings settings of the custom analyzer
         */
        public MyTokenizerFactory(IndexSettings indexSettings, String name, Environment env, Settings settings) {
            super(indexSettings, settings, name);
            this.configuration = new MyConfiguration(indexSettings, name, env, settings);
        }

        /**
         * Static factory with the parameter order expected by the analysis provider callback.
         *
         * @param indexSettings index settings
         * @param environment Elasticsearch environment
         * @param name registered tokenizer name
         * @param settings tokenizer settings
         * @return a new tokenizer factory
         */
        public static TokenizerFactory getTokenizerFactory(IndexSettings indexSettings, Environment environment, String name, Settings settings) {
            // The former trailing .setSmart(false) call was removed: this class defines no such method.
            return new MyTokenizerFactory(indexSettings, name, environment, settings);
        }

        /**
         * @return a new tokenizer bound to this factory's configuration
         */
        @Override
        public Tokenizer create() {
            return new MyTokenizer(configuration);
        }
    }
    

    核心来了,它就是MyTokenizer。
    MyTokenizer继承org.apache.lucene.analysis.Tokenizer,同时必须是final类型,不然启动会报错。
    代码示例如下

    /**
     * Tokenizer that emits the terms produced by {@link AnalyzeContext}, skipping blank ones.
     *
     * <p>The class is final: Lucene requires a token stream implementation (or its
     * {@code incrementToken} method) to be final.
     */
    public final class MyTokenizer extends Tokenizer {
        /** Text of the current token. */
        private final CharTermAttribute termAtt;
        /** Start and end offsets of the current token. */
        private final OffsetAttribute offsetAtt;
        /** Position increment of the current token. */
        private final PositionIncrementAttribute positionAttr;

        /**
         * Base offset added to every term offset. It grows by the context's consumed length
         * each time the input is exhausted, is left untouched by {@link #reset()}, and is
         * zeroed in {@link #end()}.
         */
        private int totalOffset = 0;

        private final AnalyzeContext analyzeContext;

        /**
         * @param configuration configuration handed to the analysis context
         */
        public MyTokenizer(Configuration configuration) {
            super();
            offsetAtt = addAttribute(OffsetAttribute.class);
            termAtt = addAttribute(CharTermAttribute.class);
            positionAttr = addAttribute(PositionIncrementAttribute.class);

            // The reader passed here is replaced in reset() before any token is read.
            analyzeContext = new AnalyzeContext(input, configuration);
        }

        /**
         * Advances to the next non-blank term.
         *
         * @return true if a token was produced, false once the input is exhausted
         * @throws IOException if reading the input fails
         */
        @Override
        public boolean incrementToken() throws IOException {
            clearAttributes();

            // Skip blank terms so they are not indexed.
            Term term = analyzeContext.next();
            while (term != null && TextUtility.isBlank(term.getText())) {
                term = analyzeContext.next();
            }

            if (term == null) {
                totalOffset += analyzeContext.offset;
                return false;
            }

            // Skipped blank terms do not widen the gap: every emitted token advances by one position.
            positionAttr.setPositionIncrement(1);
            termAtt.setEmpty().append(term.getText());
            int start = totalOffset + term.getOffset();
            offsetAtt.setOffset(correctOffset(start), correctOffset(start + term.getText().length()));
            return true;
        }

        /**
         * Publishes the final offset and clears the accumulated base offset.
         *
         * @throws IOException if the superclass fails
         */
        @Override
        public void end() throws IOException {
            super.end();
            // Map through correctOffset, like the token offsets, so char filters are accounted for.
            int finalOffset = correctOffset(totalOffset);
            offsetAtt.setOffset(finalOffset, finalOffset);
            totalOffset = 0;
        }

        /**
         * Points the analysis context at the new input. The original author notes that this
         * override is required, otherwise bulk indexing fails.
         *
         * @throws IOException if the superclass fails
         */
        @Override
        public void reset() throws IOException {
            super.reset();
            analyzeContext.reset(new BufferedReader(this.input));
        }
    }
    

    分词类AnalyzeContext

    分词类负责读取文本,并按句子分隔符将文本切分成片段,再交给分词算法进行分词。

    /**
     * Reads the input in buffer-sized chunks cut at sentence delimiters, segments each chunk,
     * and hands out the resulting terms one at a time.
     */
    public class AnalyzeContext {

        /** Size of the read buffer, in chars. */
        private static final int BUFFER_SIZE = 4096;

        /**
         * Sentence delimiters used to decide where a full buffer may be cut.
         * Both the ASCII and the full-width form of each punctuation mark are listed;
         * the original listed the ASCII form twice, which had no effect on a set.
         */
        private static final Set<Character> delimiterCharSet = new HashSet<>();

        static {
            delimiterCharSet.add('\r');
            delimiterCharSet.add('\n');
            delimiterCharSet.add('。');
            delimiterCharSet.add('!');
            delimiterCharSet.add('\uFF01'); // full-width exclamation mark
            delimiterCharSet.add(',');
            delimiterCharSet.add('\uFF0C'); // full-width comma
            delimiterCharSet.add('?');
            delimiterCharSet.add('\uFF1F'); // full-width question mark
            delimiterCharSet.add(';');
            delimiterCharSet.add('\uFF1B'); // full-width semicolon
        }

        /** Current input. */
        private Reader input;
        /** Configuration passed to the segmenter. */
        private final Configuration configuration;
        /** Terms of the chunk currently being handed out; null before the first chunk. */
        private Iterator<Term> iterator;
        /**
         * Number of chars of the current input already turned into chunks. It is added to each
         * term's offset because the segmenter sees one chunk at a time.
         * NOTE(review): assumes the segmenter reports offsets relative to its chunk - confirm.
         */
        int offset;
        /** Read buffer. */
        private final char[] buffer = new char[BUFFER_SIZE];
        /** Number of chars at the start of the buffer carried over from the previous read. */
        private int remainSize = 0;

        /**
         * @param reader input to read from
         * @param configuration configuration passed to the segmenter
         */
        public AnalyzeContext(Reader reader, Configuration configuration) {
            this.input = reader;
            this.configuration = configuration;
        }

        /**
         * Switches to a new input and discards all state of the previous one.
         *
         * @param reader the new input
         */
        public void reset(Reader reader) {
            input = reader;
            offset = 0;
            iterator = null;
            // Drop chars carried over from an input that was not read to its end,
            // so they cannot leak into the new one.
            remainSize = 0;
        }

        /**
         * Returns the next term, reading and segmenting further chunks as needed.
         *
         * @return the next term, or null once the input is exhausted
         * @throws IOException if reading the input fails
         */
        public Term next() throws IOException {
            // Keep reading chunks until one yields terms. A chunk without terms (for example
            // punctuation only) must not end the stream, and its length still counts.
            while (iterator == null || !iterator.hasNext()) {
                String line = readLine();
                if (line == null) {
                    return null;
                }

                // todo: placeholder - replace with a call to your own segmentation algorithm
                List<Term> termList = [你的分词算法].getTextTokenizer(line, configuration);

                for (Term term : termList) {
                    term.setOffset(term.getOffset() + offset);
                }
                offset += line.length();
                iterator = termList.iterator();
            }
            return iterator.next();
        }

        /**
         * Reads the next chunk, ending at the last sentence delimiter when the buffer is full.
         *
         * @return the chunk text, or null when nothing is left
         * @throws IOException if reading the input fails
         */
        private String readLine() throws IOException {
            // Carried-over chars already occupy the start of the buffer; read behind them.
            int start = remainSize;
            // Number of chars read; negative at end of input.
            int n = input.read(buffer, start, BUFFER_SIZE - start);
            if (n < 0) {
                if (remainSize != 0) {
                    String lastLine = new String(buffer, 0, remainSize);
                    remainSize = 0;
                    return lastLine;
                }
                return null;
            }
            n += start;

            // Cut at the actual end of the last complete sentence.
            int eos = lastIndexOfEos(buffer, n);
            String line = new String(buffer, 0, eos);
            remainSize = n - eos;
            if (remainSize > 0) {
                // Move the unfinished tail to the start of the buffer for the next read.
                System.arraycopy(buffer, eos, buffer, 0, remainSize);
            }
            return line;
        }

        /**
         * Finds where the chunk should end.
         *
         * @param buffer chars read so far
         * @param length number of valid chars in {@code buffer}
         * @return {@code length} if the buffer is not full or holds no delimiter after index 0;
         *         otherwise the index just past the last delimiter
         */
        private int lastIndexOfEos(char[] buffer, int length) {
            if (length < BUFFER_SIZE) {
                return length;
            }
            for (int i = length - 1; i > 0; i--) {
                if (delimiterCharSet.contains(buffer[i])) {
                    return i + 1;
                }
            }
            return length;
        }
    }
    

    Term

    /**
     * A single token produced by the segmenter: its text, position and type.
     */
    public class Term implements Serializable {
        private static final long serialVersionUID = 1L;

        /** Start offset of the token. */
        private int offset;
        /**
         * NOTE(review): the original comment described this field as the token's relative
         * start position, which does not match its name - confirm the intended meaning.
         */
        private int end;
        /** Text of the token. */
        private String text;
        /** Type of the token. */
        private String lexemeType;

        /** @return start offset of the token */
        public int getOffset() {
            return offset;
        }

        /** @param offset start offset of the token */
        public void setOffset(int offset) {
            this.offset = offset;
        }

        /** @return value of the {@code end} field */
        public int getEnd() {
            return end;
        }

        /** @param end value of the {@code end} field */
        public void setEnd(int end) {
            this.end = end;
        }

        /** @return text of the token */
        public String getText() {
            return text;
        }

        /** @param text text of the token */
        public void setText(String text) {
            this.text = text;
        }

        /** @return type of the token */
        public String getLexemeType() {
            return lexemeType;
        }

        /** @param lexemeType type of the token */
        public void setLexemeType(String lexemeType) {
            this.lexemeType = lexemeType;
        }
    }
    

    打包&安装

    打包

    执行命令mvn clean package进行打包,target目录下会生成zip包。

    安装

    执行命令 bin/elasticsearch-plugin install file:///path/to/analysis-http.zip(即上一步在target目录下生成的zip包的路径),同意权限授权,即可。

    相关文章

      网友评论

        本文标题:如何开发一个elasticsearch分词插件

        本文链接:https://www.haomeiwen.com/subject/mkebtktx.html