美文网首页
Elasticsearch拼音分词:elasticsearch-analysis-pinyin

Elasticsearch拼音分词:elasticsearch-analysis-pinyin

作者: 懒人程序猿 | 来源:发表于2020-05-29 19:16 被阅读0次
    安装elasticsearch-analysis-pinyin插件

    请根据 Elasticsearch 的版本选择对应版本的插件进行安装,下面以 6.8.0 为例。
    拼音分词插件:https://github.com/medcl/elasticsearch-analysis-pinyin/releases
    进入Elasticsearch安装目录下的bin目录,执行下面命令安装

    ./elasticsearch-plugin install https://github.com/medcl/elasticsearch-analysis-pinyin/releases/download/v6.8.0/elasticsearch-analysis-pinyin-6.8.0.zip
    

    查看已安装插件

    ./elasticsearch-plugin list
    

    卸载插件

    ./elasticsearch-plugin remove 插件名称
    

    安装完成后需要重启 Elasticsearch,插件才会生效

    查看分词效果(使用插件内置的 pinyin 分析器)

    GET /_analyze
    {
      "text":"科技创新",
      "analyzer":"pinyin"
    }
    

    结果如下

    {
      "tokens" : [
        {
          "token" : "ke",
          "start_offset" : 0,
          "end_offset" : 0,
          "type" : "word",
          "position" : 0
        },
        {
          "token" : "kjcx",
          "start_offset" : 0,
          "end_offset" : 0,
          "type" : "word",
          "position" : 0
        },
        {
          "token" : "ji",
          "start_offset" : 0,
          "end_offset" : 0,
          "type" : "word",
          "position" : 1
        },
        {
          "token" : "chuang",
          "start_offset" : 0,
          "end_offset" : 0,
          "type" : "word",
          "position" : 2
        },
        {
          "token" : "xin",
          "start_offset" : 0,
          "end_offset" : 0,
          "type" : "word",
          "position" : 3
        }
      ]
    }
    
    使用自定义分词器

    创建索引,并在索引设置中自定义拼音分词器

    PUT /medcl/ 
    {
        "index" : {
            "analysis" : {
                "analyzer" : {
                    "pinyin_analyzer" : {
                        "tokenizer" : "my_pinyin"
                        }
                },
                "tokenizer" : {
                    "my_pinyin" : {
                        "type" : "pinyin",
                        "keep_separate_first_letter" : false,
                        "keep_full_pinyin" : true,
                        "keep_original" : true,
                        "limit_first_letter_length" : 16,
                        "lowercase" : true,
                        "remove_duplicated_term" : true
                    }
                }
            }
        }
    }
    

    测试分词器,分析一个中文名称,如:刘德华

    GET /medcl/_analyze
    {
      "text": ["刘德华"],
      "analyzer": "pinyin_analyzer"
    }
    

    结果如下

    {
      "tokens" : [
        {
          "token" : "liu",
          "start_offset" : 0,
          "end_offset" : 0,
          "type" : "word",
          "position" : 0
        },
        {
          "token" : "刘德华",
          "start_offset" : 0,
          "end_offset" : 0,
          "type" : "word",
          "position" : 0
        },
        {
          "token" : "ldh",
          "start_offset" : 0,
          "end_offset" : 0,
          "type" : "word",
          "position" : 0
        },
        {
          "token" : "de",
          "start_offset" : 0,
          "end_offset" : 0,
          "type" : "word",
          "position" : 1
        },
        {
          "token" : "hua",
          "start_offset" : 0,
          "end_offset" : 0,
          "type" : "word",
          "position" : 2
        }
      ]
    }
    

    创建映射

    POST /medcl/folks/_mapping 
    {
        "folks": {
            "properties": {
                "name": {
                    "type": "keyword",
                    "fields": {
                        "pinyin": {
                            "type": "text",
                            "store": false,
                            "term_vector": "with_offsets",
                            "analyzer": "pinyin_analyzer",
                            "boost": 10
                        }
                    }
                }
            }
        }
    }
    

    添加文档

    POST /medcl/folks/andy 
    {
        "name":"刘德华"
    }
    

    搜索测试

    GET /medcl/folks/_search
    {
        "query": {
            "match": {
                "name.pinyin": "liu"
            }
        }
    }
    

    搜索结果如下

    {
        "took": 23,
        "timed_out": false,
        "_shards": {
            "total": 5,
            "successful": 5,
            "skipped": 0,
            "failed": 0
        },
        "hits": {
            "total": 1,
            "max_score": 3.439677,
            "hits": [{
                "_index": "medcl",
                "_type": "folks",
                "_id": "andy",
                "_score": 3.439677,
                "_source": {
                    "name": "刘德华"
                }
            }]
        }
    }
    

    自定义拼音首字母分词

    PUT /medcl1/ 
    {
        "index" : {
            "analysis" : {
                "analyzer" : {
                    "user_name_analyzer" : {
                        "tokenizer" : "whitespace",
                        "filter" : "pinyin_first_letter_and_full_pinyin_filter"
                    }
                },
                "filter" : {
                    "pinyin_first_letter_and_full_pinyin_filter" : {
                        "type" : "pinyin",
                        "keep_first_letter" : true,
                        "keep_full_pinyin" : false,
                        "keep_none_chinese" : true,
                        "keep_original" : false,
                        "limit_first_letter_length" : 16,
                        "lowercase" : true,
                        "trim_whitespace" : true,
                        "keep_none_chinese_in_first_letter" : true
                    }
                }
            }
        }
    }
    

    测试分词器

    GET /medcl1/_analyze
    {
      "text": ["刘德华 张学友 郭富城 黎明 四大天王"],
      "analyzer": "user_name_analyzer"
    }
    

    结果

    {
        "tokens": [{
                "token": "ldh",
                "start_offset": 0,
                "end_offset": 3,
                "type": "word",
                "position": 0
            },
            {
                "token": "zxy",
                "start_offset": 4,
                "end_offset": 7,
                "type": "word",
                "position": 1
            },
            {
                "token": "gfc",
                "start_offset": 8,
                "end_offset": 11,
                "type": "word",
                "position": 2
            },
            {
                "token": "lm",
                "start_offset": 12,
                "end_offset": 14,
                "type": "word",
                "position": 3
            },
            {
                "token": "sdtw",
                "start_offset": 15,
                "end_offset": 19,
                "type": "word",
                "position": 4
            }
        ]
    }
    

    设置映射

    POST /medcl1/folks/_mapping 
    {
        "folks": {
            "properties": {
                "name": {
                    "type": "keyword",
                    "fields": {
                        "pinyin": {
                            "type": "text",
                            "store": false,
                            "term_vector": "with_offsets",
                            "analyzer": "user_name_analyzer",
                            "boost": 10
                        }
                    }
                }
            }
        }
    }
    

    添加数据

    POST /medcl1/folks/_bulk
    {"index": {}}
    {"name": "张学友"}
    {"index": {}}
    {"name": "刘德华"}
    {"index": {}}
    {"name": "黎明"}
    {"index": {}}
    {"name": "郭富城"}
    {"index": {}}
    {"name": "刘德华 张学友 郭富城 黎明 四大天王"}
    

    搜索测试

    GET /medcl1/folks/_search
    {
        "query": {
            "match": {
                "name.pinyin": "ldh zxy"
            }
        }
    }
    

    搜索结果(match 查询默认使用 or 运算符,因此只要包含 ldh 或 zxy 任一词项的文档都会被命中)

    {
        "took": 85,
        "timed_out": false,
        "_shards": {
            "total": 5,
            "successful": 5,
            "skipped": 0,
            "failed": 0
        },
        "hits": {
            "total": 3,
            "max_score": 0.6931472,
            "hits": [{
                    "_index": "medcl1",
                    "_type": "folks",
                    "_id": "Z2EUYHIBkE_Scf7WBrTD",
                    "_score": 0.6931472,
                    "_source": {
                        "name": "张学友"
                    }
                },
                {
                    "_index": "medcl1",
                    "_type": "folks",
                    "_id": "aGEUYHIBkE_Scf7WBrTD",
                    "_score": 0.6931472,
                    "_source": {
                        "name": "刘德华"
                    }
                },
                {
                    "_index": "medcl1",
                    "_type": "folks",
                    "_id": "2WEcYHIBkE_Scf7WarSe",
                    "_score": 0.5753642,
                    "_source": {
                        "name": "刘德华 张学友 郭富城 黎明 四大天王"
                    }
                }
            ]
        }
    }
    

    相关文章

      网友评论

          本文标题:Elasticsearch拼音分词:elasticsearch-analysis-pinyin

          本文链接:https://www.haomeiwen.com/subject/dqmfzhtx.html