Elasticsearch: Fuzzy Search with ngram and Case-Insensitive keyword

Author: AC编程 | Published 2021-10-11 16:31

    Goals

    1. Implement fuzzy search in Elasticsearch with the ngram analyzer

    2. Make keyword fields ignore case

    1. Code

    Controller

        @PostMapping("createIndex")
        @ApiOperation(value="create index")
        public Result<Boolean> createIndex() throws Exception {
            Boolean is = esMemberService.createIndex("member");
            return Result.success(is);
        }
    

    Service

        @Override
        public Boolean createIndex(String index) {
            XContentBuilder setting = packageSetting();
            XContentBuilder mapping = packageMapping();
            return createIndexSetting(index, setting, mapping);
        }
    
        private XContentBuilder packageMapping() {
            XContentBuilder mapping = null;
            try {
                // build the index mapping
                mapping = XContentFactory.jsonBuilder()
                        .startObject()
                        .field("dynamic", true)
                        .startObject("properties")
                        //id
                        .startObject("id")
                        .field("type", "long")
                        .field("index", false)
                        .endObject()
                        // account: keyword, case-insensitive
                        .startObject("markId")
                        .field("type", "keyword")
                        .field("normalizer", "lowercase")
                        .endObject()
                        // nickname: fuzzy search, case-insensitive
                        .startObject("nickName")
                        .field("type", "text")
                        .field("analyzer", "ngram")
                        .endObject()
                        // avatar
                        .startObject("iconUrl")
                        .field("type", "text")
                        .field("index", false)
                        .endObject()
                        // gender
                        .startObject("sex")
                        .field("type", "keyword")
                        .endObject()
                        .startObject("mobile")
                        .field("type", "keyword")
                        .endObject()
                        // latitude/longitude
                        .startObject("location")
                        .field("type", "geo_point")
                        .endObject()
                        // address
                        .startObject("address")
                        .field("type", "text")
                        .endObject()
                        .startObject("openMobile")
                        .field("type", "keyword")
                        .endObject()
                        // birthday
                        .startObject("birthday")
                        .field("type", "date")
                        .field("format","yyyy-MM-dd HH:mm:ss||yyyy-MM-dd||epoch_millis")
                        .endObject()
                        // creation time
                        .startObject("createTime")
                        .field("type", "date")
                        .field("format","yyyy-MM-dd HH:mm:ss||yyyy-MM-dd||epoch_millis")
                        .endObject()
                        .endObject()
                        .endObject();
            } catch (Exception e) {
                e.printStackTrace();
            }
            return mapping;
        }
    
    /**
         * ngram analyzer configuration
         * ngram: splits English words into letter grams
         * field("filter","lowercase"): makes search case-insensitive
         * index.max_ngram_diff: the maximum allowed difference between min_gram and max_gram
         * https://www.elastic.co/guide/en/elasticsearch/reference/6.8/analysis-ngram-tokenizer.html
         * normalizer: makes keyword fields case-insensitive
         * https://www.elastic.co/guide/en/elasticsearch/reference/6.0/normalizer.html
         * @return the settings builder
         */
        private XContentBuilder packageSetting() {
            XContentBuilder setting = null;
            try {
                // build the index settings
                setting = XContentFactory.jsonBuilder()
                        .startObject()
                            .field("index.max_ngram_diff","5")
                            .startObject("analysis")
                                .startObject("analyzer")
                                    .startObject("ngram")
                                        .field("tokenizer","my_tokenizer")
                                        .field("filter","lowercase")
                                    .endObject()
                                .endObject()
                                .startObject("tokenizer")
                                    .startObject("my_tokenizer")
                                        .field("type","ngram")
                                        .field("min_gram","1")
                                        .field("max_gram","3")
                                    .endObject()
                                .endObject()
                                .startObject("normalizer")
                                    .startObject("lowercase")
                                        .field("type","custom")
                                        .field("filter","lowercase")
                                    .endObject()
                                .endObject()
                            .endObject()
                        .endObject();
            } catch (Exception e) {
                e.printStackTrace();
            }
            return setting;
        }
    
    
        protected Boolean createIndexSetting(String indexName, XContentBuilder settings, XContentBuilder mapping) {
            Boolean is = false;
            try {
                CreateIndexRequest request = buildCreateIndexRequest(indexName);
                if (settings != null) {
                    request.settings(settings);
                }
                if (mapping != null) {
                    request.mapping(mapping);
                }
                // get the indices client
                IndicesClient indices = client.indices();
                // create the index
                CreateIndexResponse response = indices.create(request, COMMON_OPTIONS);
                log.info("Acknowledged by all nodes: " + response.isAcknowledged());
                log.info("Required number of shard copies started for every shard before timeout: " + response.isShardsAcknowledged());
                is = response.isAcknowledged();
            } catch (Exception e) {
                e.printStackTrace();
            }
            return is;
        }
    

    2. JSON Format

    PUT member
    {
      "settings": {
        "index.max_ngram_diff": "5",
        "analysis": {
          "analyzer": {
            "ngram": {
              "tokenizer": "my_tokenizer",
              "filter": "lowercase"
            }
          },
          "tokenizer": {
            "my_tokenizer": {
              "type": "ngram",
              "min_gram": 1,
              "max_gram": 3
            }
          },
          "normalizer": {
            "lowercase": {
              "type": "custom",
              "filter": "lowercase"
            }
          }
        }
      }
    }
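
    The mapping built by packageMapping above corresponds to the following JSON. This block is a sketch reconstructed from the Java code, assuming a typeless (7.x-style) mapping API:

    PUT member/_mapping
    {
      "dynamic": true,
      "properties": {
        "id":         { "type": "long", "index": false },
        "markId":     { "type": "keyword", "normalizer": "lowercase" },
        "nickName":   { "type": "text", "analyzer": "ngram" },
        "iconUrl":    { "type": "text", "index": false },
        "sex":        { "type": "keyword" },
        "mobile":     { "type": "keyword" },
        "location":   { "type": "geo_point" },
        "address":    { "type": "text" },
        "openMobile": { "type": "keyword" },
        "birthday":   { "type": "date", "format": "yyyy-MM-dd HH:mm:ss||yyyy-MM-dd||epoch_millis" },
        "createTime": { "type": "date", "format": "yyyy-MM-dd HH:mm:ss||yyyy-MM-dd||epoch_millis" }
      }
    }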
    

    3. Parameter Notes

    3.1 filter: lowercase

    Makes search case-insensitive: if the field value is alan, then searching for alan, ALAN, or Alan will all find it.
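
    For example, because markId is a keyword field with the lowercase normalizer, the normalizer is also applied to the query input at search time, so a term query matches regardless of case (a minimal sketch against the member index above):

    GET member/_search
    {
      "query": {
        "term": { "markId": "ALAN" }
      }
    }

    This finds the document even if its markId was indexed as alan or Alan.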

    3.2 min_gram, max_gram

    min_gram and max_gram set the minimum and maximum number of characters per gram. The shorter the grams, the fewer fragments are produced and the more (lower-quality) matches you get; the longer the grams, the more fragments are produced and the more precise the matches.

    With min_gram 1 and max_gram 1, the word Quick becomes [Q, u, i, c, k]. Searching for the keyword Qui splits it into the three letters Q, u, i, so it may match words such as Quick, Query, your, and like.

    With min_gram 1 and max_gram 3, the word Quick becomes [Q, Qu, Qui, u, ui, uic, i, ic, ick, c, ck, k]. Searching for Qui only matches words that contain Qui, so the only result is Quick.
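
    You can check the tokenization with the _analyze API. Since the custom ngram analyzer defined above includes the lowercase filter, the grams come back lowercased:

    POST member/_analyze
    {
      "analyzer": "ngram",
      "text": "Quick"
    }

    With min_gram 1 and max_gram 3, this returns q, qu, qui, u, ui, uic, i, ic, ick, c, ck, k.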

    3.3 index.max_ngram_diff

    min_gram defaults to 1 and max_gram defaults to 2, and by default their difference may be at most 1. If you configure values whose difference is greater than 1, you must first raise the index.max_ngram_diff setting. In the settings above, max_gram (3) minus min_gram (1) is 2, so index.max_ngram_diff must be at least 2; the value 5 used here leaves room to widen the grams later.
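
    For example, a create-index request like the following (hypothetical index name) is rejected, because 3 - 1 = 2 exceeds the default limit of 1; it only succeeds once index.max_ngram_diff is set to 2 or higher:

    PUT member_bad
    {
      "settings": {
        "analysis": {
          "tokenizer": {
            "my_tokenizer": {
              "type": "ngram",
              "min_gram": 1,
              "max_gram": 3
            }
          }
        }
      }
    }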

    4. References

    一文带你彻底搞懂Elasticsearch中的模糊查询 (A thorough guide to fuzzy queries in Elasticsearch)

    Elasticsearch 警惕使用 wildcard 检索!然后呢? (Beware of wildcard queries in Elasticsearch! Then what?)

    NGram Tokenizer

    Elasticsearch Analysis 03 - Tokenizer
