美文网首页
elasticsearch自定义分析器

elasticsearch自定义分析器

作者: 陈文瑜 | 来源:发表于2019-08-06 14:15 被阅读0次

    elasticsearch分析器三功能自定义

    • 字符过滤器(char_filter)
      首先,字符串按顺序通过每个字符过滤器,它们的任务是在分词前整理字符串
      一个字符过滤器可以用来去掉HTML,或者将&转化成and
      
    • 分词器(tokenizer)
      其次,字符串被分词器分为单个的词条,一个简单的分词器遇到空格和标点的时候,可能会将文本拆分成词条
      Hello how are you?会被ES预设的分词器standard分成Hello、how、are、you(分词器本身不改变大小写,转小写由后面的token过滤器lowercase完成)
      
    • Token 过滤器 (filter)
      最后,词条按顺序通过每个 token 过滤器,这个过程可能会改变词条(Quick -> quick)、
      删除词条(a、an、and、the...)、增加词条(jump和leap这种同义词)
      
    • 过滤器(filter)解析
      edge_ngram_filter:将每个词都进行进一步的切分,用于即时搜索(instant search)。
      `min_gram`表示切分出的词条的最小长度;设为1时,用户只输入一个字符我们也能进行匹配。
      `max_gram`表示匹配的最大长度,最大长度越长越占用空间
      pinyin_simple_filter:拼音首字母的过滤器
      pinyin_full_filter:拼音全拼的过滤器
      

    自定义分析器

    • 自定义setting格式
      PUT 127.0.0.1:9200/mytest
      {
          "settings": {
              "analysis": {
                  "char_filter": { 自定义的字符过滤器 },
                  "tokenizer": { 自定义的分词器 },
                  "filter": { 自定义的token过滤器 },
                  "analyzer": { 自定义的分析器,可以将上面的char_filter、tokenizer、filter用不同的组合拼起来,形成不同的分析器 }
              }
          }
      }
      

    实例

    #设置setting
    PUT /enterpriseextend
    {
      "settings": {
        "analysis": {
          "filter": {
            "edge_ngram_filter": {
              "type": "edge_ngram",
              "min_gram": 1,
              "max_gram": 50
            },
            "pinyin_simple_filter": {
              "type": "pinyin",
              "keep_first_letter": true,
              "keep_separate_first_letter": false,
              "keep_full_pinyin": false,
              "keep_original": false,
              "limit_first_letter_length": 50,
              "lowercase": true
            },
            "pinyin_full_filter": {
              "type": "pinyin",
              "keep_first_letter": false,
              "keep_separate_first_letter": false,
              "keep_full_pinyin": true,
              "none_chinese_pinyin_tokenize": true,
              "keep_original": false,
              "limit_first_letter_length": 50,
              "lowercase": true
            }
          },
          "tokenizer": {
            "ik_max_word": {
              "type": "ik_max_word",
              "use_smart": true
            }
          },
          "analyzer": {
            "ngramIndexAnalyzer": {
              "type": "custom",
              "tokenizer": "keyword",
              "filter": [
                "edge_ngram_filter",
                "lowercase"
              ]
            },
            "ikIndexAnalyzer": {
              "type": "custom",
              "tokenizer": "ik_max_word"
            },
            "pinyiSimpleIndexAnalyzer": {
              "tokenizer": "keyword",
              "filter": [
                "pinyin_simple_filter",
                "edge_ngram_filter",
                "lowercase"
              ]
            },
            "pinyiFullIndexAnalyzer": {
              "tokenizer": "keyword",
              "filter": [
                "pinyin_full_filter",
                "lowercase"
              ]
            }
          }
        }
      }
    }
    #设置mapping
    PUT enterpriseextend/_mapping/enterpriseextend
    {
      "properties": {
        "id": {
          "type": "long"
        },
        "entName": {
          "type": "text", 
          "analyzer": "ikIndexAnalyzer",
          "fields": {
            "ngram": {
              "type": "text", 
              "analyzer": "ngramIndexAnalyzer"
            },
            "SPY": {
              "type": "text", 
              "analyzer": "pinyiSimpleIndexAnalyzer"
            },
            "FPY": {
              "type": "text", 
              "analyzer": "pinyiFullIndexAnalyzer"
            }
          }
        }
      }
    }
    #插入语句
    PUT enterpriseextend/enterpriseextend/1
    {
      "entName":"确实不是啥好东西"
    }
    #三种查询
    GET enterpriseextend/_search
    {
      "query": {
        "match": {
          "entName.ngram": "确实不是啥好东西"
        }
      }
    }
    GET enterpriseextend/_search
    {
      "query": {
        "match": {
          "entName.SPY": "qsbsshdx"
        }
      }
    }
    GET enterpriseextend/_search
    {
      "query": {
        "match": {
          "entName.FPY": "queshibushishahaodongxi"
        }
      }
    }
    

    综合查询

    GET enterpriseextend/_search
    {
      "query": {
        "bool": {
          "should": [
            {
              "match": {
                "entName.ngram": {
                  "query": "确实不是啥好东西",
                  "boost": 5 
                }
              }
            },
            {
              "match": { 
                "entName.SPY": {
                  "query": "qsbsshdx",
                  "boost": 1 
                }
              }
            },
            {
              "match": { 
                "entName.FPY": {
                  "query": "queshibushishahaodongxi",
                  "boost": 0.8
                }
              }
            }
          ]
        }
      }
    }
    

    相关文章

      网友评论

          本文标题:elasticsearch自定义分析器

          本文链接:https://www.haomeiwen.com/subject/gpwidctx.html