美文网首页
Elasticsearch---索引管理、基于scroll+bu

Elasticsearch---索引管理、基于scroll+bu

作者: 缓慢移动的蜗牛 | 来源:发表于2017-04-09 18:27 被阅读0次

    创建索引的语法

    PUT /my_index
    {
        "settings": { ... any settings ... },
        "mappings": {
            "type_one": { ... any mappings ... },
            "type_two": { ... any mappings ... },
            ...
        }
    }
    

    示例:

    PUT /my_index
    {
      "settings": {
        "number_of_shards": 1,
        "number_of_replicas": 0
      },
      "mappings": {
        "my_type":{
          "properties": {
            "my_field":{
              "type": "text"
            }
          }
        }
      }
    }
    

    修改索引(索引一旦建立,number_of_shards不能修改,但number_of_replicas等设置可以修改)

    PUT /my_index/_settings
    {
      "number_of_replicas": 1
    }
    PUT /my_index/_mapping/my_type
    {
      "properties": {
        "my_field":{
          "type": "string"
        }
      }
    }
    

    删除索引

    DELETE /my_index
    DELETE /index_one,index_two
    DELETE /index_*
    DELETE /_all    //要想这样删除,需要修改config/elasticsearch.yml 中action.destructive_requires_name: false
    

    分词器的修改与定制

    • 修改分词器设置

    默认分词器是standard

    • standard tokenizer:以单词边界进行切分
    • standard token filter:什么都不做
    • lowercase token filter:将所有字符转换为小写
    • stop token filter(默认禁用):移除停用词
    PUT /my_index
    {
     "settings": {
       "analysis": {
         "analyzer": {
           "my_analyzer_std":{
             "type":"standard",
             "stopwords":"_english_"
           }
         }
       }
     }
    }
    //测试
    GET /my_index/_analyze
    {
     "analyzer": "my_analyzer_std",
     "text": "hello,you are the good boy"    // are the会被去掉
    }
    
    • 定制自己的分词器
    PUT /my_index
    {
      "settings": {
        "analysis": {
          "char_filter":{
            "&_to_and":{
              "type":"mapping",
              "mappings":["&=> and"]
            }
          },
          "filter": {
            "my_stopwords":{
              "type":"stop",
              "stopwords":["the","are"]
            }
          },
          "analyzer": {
            "my_analyzer":{
              "type":"custom",
              "char_filter":["html_strip","&_to_and"],
              "tokenizer":"standard",
              "filter":["lowercase","my_stopwords"]
            }
          }
        }
      }
    }
    //测试
    GET /my_index/_analyze
    {
      "analyzer": "my_analyzer",
      "text":"tom & jery,they are the good friend,<a>CLICK ME</a>"
    }
    

    type底层的数据结构

    type,是一个index中用来区分类似的数据的。这些数据类似,但是可能有不同的fields,而且有不同的属性来控制索引建立、分词器
    field的value,在底层的lucene中建立索引的时候,全部是opaque bytes(二进制)类型,不区分类型的。
    lucene是没有type的概念的,在document中,实际上将type作为一个document的field来存储,即_type,es通过_type来进行type的过滤和筛选
    一个index中的多个type,实际上是放在一起存储的,因此一个index下,不同type中的同名field,其类型或者其他设置必须一致,否则是无法处理的

    //设置_mapping
    PUT /ecommerce
    {
     "mappings": {
       "elactronic_goods":{
         "properties": {
           "name":{
             "type": "string"
           },
           "price":{
             "type":"double"
           },
           "service_period":{
             "type":"string"
           }
         }
       },
       "fresh_goods":{
         "properties": {
           "name":{
             "type": "string"
           },
           "price":{
             "type": "double"
           },
           "eat_period":{
             "type":"string"
           }
         }
       }
     }
    }
    //查询_mapping
    GET /ecommerce/_mapping
    //存入document
    PUT /ecommerce/elactronic_goods/1
    {
     "name":"geli kongtiao",
     "price":3999,
     "service_period":"one year"
    }
    PUT /ecommerce/fresh_goods/1
    {
     "name":"da xia",
     "price":99,
     "eat_period":"one week"
    }
    

    底层数据结构是这样的

    {
       "ecommerce": {
          "mappings": {
            "_type": {
              "type": "string",
              "index": "not_analyzed"
            },
            "name": {
              "type": "string"
            },
            "price": {
              "type": "double"
            },
            "service_period": {
              "type": "string"
            },
            "eat_period": {
              "type": "string"
            }
          }
       }
    }
    //放入的document是这样的
    {
      "_type": "elactronic_goods",
      "name": "geli kongtiao",
      "price": 3999.0,
      "service_period": "one year",
      "eat_period": ""
    }
    {
      "_type": "fresh_goods",
      "name": "da xia",
      "price": 99.0,
      "service_period": "",
      "eat_period": "one week"
    }
    

    所以应该把类似结构的type放在一个index下,这些type应该有多个field是相同的,如果一个index的多个type的field完全不同,那么每条数据会有一大部分的field在底层lucene中是空值,会有严重的性能问题

    _mapping root object深入剖析

    • root object
      就是某个type对应的mapping json,包括了properties,metadata(_id,_source,_type),setting(analyzer),其他setting(比如include_in_all)
    PUT /my_index
    {
      "mappings": {
        "my_type":{
          "properties": {}
        }
      }
    }
    
    • properties
      包含有type,index,analyzer
    PUT /my_index/_mapping/my_type
    {
      "properties": {
        "title":{
          "type": "text"
        }
      }
    }
    
    • _source
    • 查询的时候,直接可以拿到完整的document,不需要先拿document id,再发送一次请求拿document
    • partial update基于_source实现
    • reindex时,直接基于_source实现,不需要从数据库(或者其他外部存储)查询数据再修改
    • 可以基于_source定制返回field
    • debug query更容易,因为可以直接看到_source

    如果不需要上述好处,可以禁用_source

    PUT /my_index/_mapping/my_type2
    {
      "_source": {"enabled": false}
    }
    
    • _all
      将所有field打包在一起,作为一个_all field,建立索引。没指定任何field进行搜索时,就是使用_all field在搜索。
    PUT /my_index/_mapping/my_type3
    {
      "_all": {"enabled": false}
    }
    

    也可以在field级别设置include_in_all field,设置是否要将field的值包含在_all field中

    PUT /my_index/_mapping/my_type4
    {
      "properties": {
        "my_field": {
          "type": "text",
          "include_in_all": false
        }
      }
    }
    

    dynamic mapping策略

    • 定制策略
    • true:遇到陌生字段,就进行dynamic mapping
    • false:遇到陌生字段,就忽略
    • strict:遇到陌生字段,就报错
    PUT /my_index
    {
     "mappings": {
       "my_type":{
         "dynamic":"strict",
         "properties": {
           "title":{
             "type": "text"
           },
           "address":{
             "type": "object",
             "dynamic":"true"
           }
         }
       }
     }
    }
    
    PUT /my_index/my_type/1
    {
     "content":"uuuu",
     "title":"hello world",
     "address":{
       "country":"china",
       "provice":"beiing"
     }
    }
    //结果
    {
     "error": {
       "root_cause": [
         {
           "type": "strict_dynamic_mapping_exception",
           "reason": "mapping set to strict, dynamic introduction of [content] within [my_type] is not allowed"
         }
       ],
       "type": "strict_dynamic_mapping_exception",
       "reason": "mapping set to strict, dynamic introduction of [content] within [my_type] is not allowed"
     },
     "status": 400
    }
    
    • date detection
      默认会按照一定格式识别date,比如yyyy-MM-dd。但是如果某个field先过来一个2017-01-01的值,就会被自动dynamic mapping成date,后面如果再来一个"hello world"之类的值,就会报错。可以手动关闭某个type的date_detection,如果有需要,自己手动指定某个field为date类型。
    PUT /my_index/_mapping/my_type
    {
      "date_detection": false
    }
    
    • 定制自己的dynamic mapping template(type level)
    PUT /my_index
    {
      "mappings": {
        "my_type":{
          "dynamic_templates":[
            {
              "en":{
                "match":"*_en",
                "match_mapping_type":"string",
                "mapping":{
                  "type":"string",
                  "analyzer":"english"
                }
              }
            }
          ]
        }
      }
    }
    //插入数据
    PUT /my_index/my_type/1
    {
      "title":"this is my first article"
    }
    PUT /my_index/my_type/2
    {
      "title_en":"this is my first article"
    }
    //查询
    //-------------------------------第一个
    //没有匹配到任何的dynamic模板
    //默认就是standard分词器,不会过滤停用词,is会进入倒排索引,用is来搜索是可以搜索到的
    GET /my_index/my_type/_search
    {
      "query": {
        "match": {
          "title": "is"
        }
      }
    }
    //-------------------------------第二个
    //匹配到了dynamic模板,就是english分词器,会过滤停用词,is这种停用词就会被过滤掉,用is来搜索就搜索不到了
    GET /my_index/my_type/_search
    {
      "query": {
        "match": {
          "title_en": "is"
        }
      }
    }
    
    • 定制自己的default mapping template(index level)
    PUT /my_index
    {
      "mappings": {
        "_default_":{
          "_all":{
            "enabled":false
          }
        },
       "blog":{
          "_all":{
            "enabled":true
          }
        }
      }
    }
    

    基于scroll+bulk的索引重建

    一个field的设置是不能被修改的,如果要修改一个field,那么应该重新按照新的mapping,建立一个index,然后将数据批量查询出来,重新用bulk api写入index中
    批量查询的时候,建议采用scroll api,并且采用多线程并发的方式来reindex数据,每次scroll就查询指定日期的一段数据,交给一个线程即可

    • 插入模拟数据,但是不小心有些数据是2017-01-01这种日期格式,所以title这种field就被自动映射为date类型,实际上它应该是string类型
    PUT /my_index/my_type/1
    {
      "title":"2017-01-01"
    }
    PUT /my_index/my_type/2
    {
      "title":"2017-01-02"
    }
    PUT /my_index/my_type/3
    {
      "title":"2017-01-03"
    }
    
    • 然后向索引中加入string类型的title值的时候,会报错
    PUT /my_index/my_type/4
    {
      "title":"my first article"
    }
    {
      "error": {
        "root_cause": [
          {
            "type": "mapper_parsing_exception",
            "reason": "failed to parse [title]"
          }
        ],
        "type": "mapper_parsing_exception",
        "reason": "failed to parse [title]",
        "caused_by": {
          "type": "illegal_argument_exception",
          "reason": "Invalid format: \"my first article\""
        }
      },
      "status": 400
    }
    //查看其mapping
    GET /my_index/_mapping/my_type
    {
      "my_index": {
        "mappings": {
          "my_type": {
            "properties": {
              "title": {
                "type": "date"
              }
            }
          }
        }
      }
    }
    
    • 此时尝试修改title的类型
    PUT /my_index/_mapping/my_type
    {
      "properties": {
        "title":{
          "type": "string"
        }
      }
    }
    //返回结果
    {
      "error": {
        "root_cause": [
          {
            "type": "illegal_argument_exception",
            "reason": "mapper [title] of different type, current_type [date], merged_type [text]"
          }
        ],
        "type": "illegal_argument_exception",
        "reason": "mapper [title] of different type, current_type [date], merged_type [text]"
      },
      "status": 400
    }
    
    • 此时,唯一的办法就是进行reindex(重建索引),将旧索引的数据查询出来,再导入新索引
    • 给旧索引起一个别名,这个别名指向旧的索引,应用程序通过这个别名来访问索引(而不是直接使用索引名),这样后面切换到新索引时应用程序无需修改
    PUT /my_index/_alias/goods_index
    
    • 新建一个index,调整其title类型为string
    PUT /my_index_new
    {
      "mappings": {
        "my_type":{
          "properties": {
            "title":{
              "type": "string"
            }
          }
        }
      }
    }
    
    • 使用scroll api将数据批量查出来
    GET /my_index/_search?scroll=1m
    {
      "query": {
        "match_all": {}
      },
      "sort":["_doc"],
      "size":1
    }
    
    • 采用bulk api将scroll查询出来的一批数据,批量写入新的索引中
    POST /_bulk
    {"index":{"_index":"my_index_new","_type":"my_type","_id":"2"}}
    {"title":"2017-01-02"}
    
    • 重复上面的两个步骤,把所有的数据都写入新的索引
    GET /_search/scroll
    {
      "scroll":"1m",
      "scroll_id":"DnF1ZXJ5VGhlbkZldGNoBQAAAAAAAC6bFlhIb1FOME82U3llb202bER1Zm95VkEAAAAAAAAumBZYSG9RTjBPNlN5ZW9tNmxEdWZveVZBAAAAAAAALpwWWEhvUU4wTzZTeWVvbTZsRHVmb3lWQQAAAAAAAC6ZFlhIb1FOME82U3llb202bER1Zm95VkEAAAAAAAAumhZYSG9RTjBPNlN5ZW9tNmxEdWZveVZB"
    }
    POST /_bulk
    {"index":{"_index":"my_index_new","_type":"my_type","_id":"...."}}
    {"title":"..."}
    
    • 将goods_index alias切换到my_index_new上去,应用程序通过别名访问,无需停机即可使用新索引中的数据
    POST /_aliases
    {
      "actions": [
        {
          "remove": {"index":"my_index","alias": "goods_index"}
        },
        {
          "add":{"index":"my_index_new","alias": "goods_index"}
        }
      ]
    }
    

    相关文章

      网友评论

          本文标题:Elasticsearch---索引管理、基于scroll+bu

          本文链接:https://www.haomeiwen.com/subject/zbldattx.html