美文网首页
05-ElasticSearch分词

05-ElasticSearch分词

作者: Y了个J | 来源:发表于2020-08-13 19:37 被阅读0次
    分词

    一个tokenizer(分词器)接收一个字符流,将之分割为独立的tokens(词元,通常是独立的单词),然后输出tokens流。
    例如:whitespace tokenizer遇到空白字符时分割文本。它会将文本“Quick brown fox!”分割为[Quick,brown,fox!]。

    该tokenizer(分词器)还负责记录各个terms(词条)的顺序或position位置(用于phrase短语和word proximity词近邻查询),以及term(词条)所代表的原始word(单词)的start(起始)和end(结束)的character offsets(字符偏移量)(用于高亮显示搜索的内容)。

    elasticsearch提供了很多内置的分词器,可以用来构建custom analyzers(自定义分词器)。
    关于分词器: https://www.elastic.co/guide/en/elasticsearch/reference/7.6/analysis.html

    POST _analyze
    {
      "analyzer": "standard",
      "text": "The 2 QUICK Brown-Foxes jumped over the lazy dog's bone."
    }
    

    执行结果:

    {
      "tokens" : [
        {
          "token" : "the",
          "start_offset" : 0,
          "end_offset" : 3,
          "type" : "<ALPHANUM>",
          "position" : 0
        },
        {
          "token" : "2",
          "start_offset" : 4,
          "end_offset" : 5,
          "type" : "<NUM>",
          "position" : 1
        },
        {
          "token" : "quick",
          "start_offset" : 6,
          "end_offset" : 11,
          "type" : "<ALPHANUM>",
          "position" : 2
        },
        {
          "token" : "brown",
          "start_offset" : 12,
          "end_offset" : 17,
          "type" : "<ALPHANUM>",
          "position" : 3
        },
        {
          "token" : "foxes",
          "start_offset" : 18,
          "end_offset" : 23,
          "type" : "<ALPHANUM>",
          "position" : 4
        },
        {
          "token" : "jumped",
          "start_offset" : 24,
          "end_offset" : 30,
          "type" : "<ALPHANUM>",
          "position" : 5
        },
        {
          "token" : "over",
          "start_offset" : 31,
          "end_offset" : 35,
          "type" : "<ALPHANUM>",
          "position" : 6
        },
        {
          "token" : "the",
          "start_offset" : 36,
          "end_offset" : 39,
          "type" : "<ALPHANUM>",
          "position" : 7
        },
        {
          "token" : "lazy",
          "start_offset" : 40,
          "end_offset" : 44,
          "type" : "<ALPHANUM>",
          "position" : 8
        },
        {
          "token" : "dog's",
          "start_offset" : 45,
          "end_offset" : 50,
          "type" : "<ALPHANUM>",
          "position" : 9
        },
        {
          "token" : "bone",
          "start_offset" : 51,
          "end_offset" : 55,
          "type" : "<ALPHANUM>",
          "position" : 10
        }
      ]
    }
    
    安装ik分词器

    所有语言的分词,默认使用的都是“Standard Analyzer”,但是该分词器对中文分词并不友好。为此需要安装中文的分词器。

    Mac下因为文件夹下有.DS_Store文件导致安装分词器有点问题,可以先启动容器后进入容器内部进行安装

    docker exec -it elasticsearch /bin/bash #进入容器
    cd /usr/share/elasticsearch/bin
    ./elasticsearch-plugin install https://github.com/medcl/elasticsearch-analysis-ik/releases/download/v7.4.2/elasticsearch-analysis-ik-7.4.2.zip
    elasticsearch-plugin list  # 列出我们所有安装的插件,看有没有ik
    

    CentOS环境按下面安装
    https://github.com/medcl/elasticsearch-analysis-ik/releases 找对应es版本下载
    在前面安装elasticsearch时,我们已经将elasticsearch容器的“/usr/share/elasticsearch/plugins”目录,映射到宿主机的 /mydata/elasticsearch/plugins 目录下,所以比较方便的做法就是下载“elasticsearch-analysis-ik-7.4.2.zip”文件,然后解压到该目录下的ik文件夹中即可。安装完毕后,需要重启elasticsearch容器。

    cd /mydata/elasticsearch/plugins
    mkdir ik
    cd /mydata/elasticsearch/plugins/ik
    # 如果没有wget 命令先安装wget:yum -y install wget
    wget https://github.com/medcl/elasticsearch-analysis-ik/releases/download/v7.4.2/elasticsearch-analysis-ik-7.4.2.zip
    unzip elasticsearch-analysis-ik-7.4.2.zip
    # 如果报 unzip: command not found的错误就执行下:yum install -y unzip zip 
    chmod -R 777 /mydata/elasticsearch/plugins/ik
    docker restart elasticsearch #重启elasticsearch
    docker exec -it elasticsearch /bin/bash #进入容器
    ls /usr/share/elasticsearch/plugins  #看有没有ik目录
    cd /usr/share/elasticsearch/bin
    elasticsearch-plugin -h
    elasticsearch-plugin list  # 列出我们所有安装的插件,看有没有ik
    

    还可以采用如下的方式。
    查看elasticsearch版本号(ik分词器的版本必须与elasticsearch的版本一致,否则插件无法加载):

    [root@hadoop-104 ~]# curl http://localhost:9200
    {
      "name" : "0adeb7852e00",
      "cluster_name" : "elasticsearch",
      "cluster_uuid" : "9gglpP0HTfyOTRAaSe2rIg",
      "version" : {
        "number" : "7.6.2",      #版本号为7.6.2
        "build_flavor" : "default",
        "build_type" : "docker",
        "build_hash" : "ef48eb35cf30adf4db14086e8aabd07ef6fb113f",
        "build_date" : "2020-03-26T06:34:37.794943Z",
        "build_snapshot" : false,
        "lucene_version" : "8.4.0",
        "minimum_wire_compatibility_version" : "6.8.0",
        "minimum_index_compatibility_version" : "6.0.0-beta1"
      },
      "tagline" : "You Know, for Search"
    }
    [root@hadoop-104 ~]# 
    

    进入es容器内部plugin目录:docker exec -it 容器id /bin/bash

    [root@hadoop-104 ~]# docker exec -it elasticsearch /bin/bash
    [root@0adeb7852e00 elasticsearch]# 
    [root@0adeb7852e00 elasticsearch]# pwd
    /usr/share/elasticsearch
    #下载ik7.4.2
    [root@0adeb7852e00 elasticsearch]# wget https://github.com/medcl/elasticsearch-analysis-ik/releases/download/v7.4.2/elasticsearch-analysis-ik-7.4.2.zip
    [root@0adeb7852e00 elasticsearch]# unzip elasticsearch-analysis-ik-7.4.2.zip -d ik
    Archive:  elasticsearch-analysis-ik-7.4.2.zip
       creating: ik/config/
      inflating: ik/config/main.dic      
      inflating: ik/config/quantifier.dic  
      inflating: ik/config/extra_single_word_full.dic  
      inflating: ik/config/IKAnalyzer.cfg.xml  
      inflating: ik/config/surname.dic   
      inflating: ik/config/suffix.dic    
      inflating: ik/config/stopword.dic  
      inflating: ik/config/extra_main.dic  
      inflating: ik/config/extra_stopword.dic  
      inflating: ik/config/preposition.dic  
      inflating: ik/config/extra_single_word_low_freq.dic  
      inflating: ik/config/extra_single_word.dic  
      inflating: ik/elasticsearch-analysis-ik-7.6.2.jar  
      inflating: ik/httpclient-4.5.2.jar  
      inflating: ik/httpcore-4.4.4.jar   
      inflating: ik/commons-logging-1.2.jar  
      inflating: ik/commons-codec-1.9.jar  
      inflating: ik/plugin-descriptor.properties  
      inflating: ik/plugin-security.policy  
    [root@0adeb7852e00 elasticsearch]#
    #移动到plugins目录下
    [root@0adeb7852e00 elasticsearch]# mv ik plugins/
    [root@0adeb7852e00 elasticsearch]# rm -rf elasticsearch-analysis-ik-7.4.2.zip 
    
    测试分词器

    使用默认分词器

    GET my_index/_analyze
    {
       "text":"我是中国人"
    }
    

    执行结果:

    {
      "tokens" : [
        {
          "token" : "我",
          "start_offset" : 0,
          "end_offset" : 1,
          "type" : "<IDEOGRAPHIC>",
          "position" : 0
        },
        {
          "token" : "是",
          "start_offset" : 1,
          "end_offset" : 2,
          "type" : "<IDEOGRAPHIC>",
          "position" : 1
        },
        {
          "token" : "中",
          "start_offset" : 2,
          "end_offset" : 3,
          "type" : "<IDEOGRAPHIC>",
          "position" : 2
        },
        {
          "token" : "国",
          "start_offset" : 3,
          "end_offset" : 4,
          "type" : "<IDEOGRAPHIC>",
          "position" : 3
        },
        {
          "token" : "人",
          "start_offset" : 4,
          "end_offset" : 5,
          "type" : "<IDEOGRAPHIC>",
          "position" : 4
        }
      ]
    }
    

    使用ik分词器

    GET my_index/_analyze
    {
       "analyzer": "ik_smart", 
       "text":"我是中国人"
    }
    
    或者
    
    GET my_index/_analyze
    {
       "analyzer": "ik_max_word", 
       "text":"我是中国人"
    }
    

    输出结果(以ik_smart为例;ik_max_word会做更细粒度的切分,还会额外输出“中国”、“国人”等词):

    {
      "tokens" : [
        {
          "token" : "我",
          "start_offset" : 0,
          "end_offset" : 1,
          "type" : "CN_CHAR",
          "position" : 0
        },
        {
          "token" : "是",
          "start_offset" : 1,
          "end_offset" : 2,
          "type" : "CN_CHAR",
          "position" : 1
        },
        {
          "token" : "中国人",
          "start_offset" : 2,
          "end_offset" : 5,
          "type" : "CN_WORD",
          "position" : 2
        }
      ]
    }
    

    相关文章

      网友评论

          本文标题:05-ElasticSearch分词

          本文链接:https://www.haomeiwen.com/subject/sqvbdktx.html