美文网首页
Aggregation

Aggregation

作者: 逸章 | 来源:发表于2020-07-17 11:42 被阅读0次

    语法结构:



    图片.png

    例一

    1、定义sports这个index的mapping

    {
      "mappings": {
        "properties": {
          "birthdate": {
            "type": "date",
            "format": "dateOptionalTime"
          },
          "location": {
            "type": "geo_point"
          },
          "name": {
            "type": "keyword"
          },
          "rating": {
            "type": "integer"
          },
          "sport": {
            "type": "keyword"
          }
        }
      }
    }
    
    图片.png

    2、通过BULK API插入数据

    curl -H "Content-Type: application/json" -X POST http://localhost:9200/_bulk --data '
    {"index":{"_index":"sports"}}
    {"name":"Michael","birthdate":"1989-10-1","sport":"Baseball","rating":["5","4"],"location":"46.22,-68.45"}
    {"index":{"_index":"sports"}}
    {"name":"Bob","birthdate":"1989-11-2","sport":"Baseball","rating":["3","4"],"location":"45.21,-68.35"}
    {"index":{"_index":"sports"}}
    {"name":"Jim","birthdate":"1988-10-3","sport":"Baseball","rating":["3","2"],"location":"45.16,-63.58"}
    {"index":{"_index":"sports"}}
    {"name":"Joe","birthdate":"1992-5-20","sport":"Baseball","rating":["4","3"],"location":"45.22,-68.53"}
    {"index":{"_index":"sports"}}
    {"name":"Tim","birthdate":"1992-2-28","sport":"Baseball","rating":["3","3"],"location":"46.22,-68.85"}
    {"index":{"_index":"sports"}}
    {"name":"Alfred","birthdate":"1990-9-9","sport":"Baseball","rating":["2","2"],"location":"45.12,-68.35"}
    {"index":{"_index":"sports"}}
    {"name":"Jeff","birthdate":"1990-4-1","sport":"Baseball","rating":["2","3"],"location":"46.12,-68.55"}
    {"index":{"_index":"sports"}}
    {"name":"Will","birthdate":"1988-3-1","sport":"Baseball","rating":["4","4"],"location":"46.25,-68.55"}
    {"index":{"_index":"sports"}}
    {"name":"Mick","birthdate":"1989-10-1","sport":"Baseball","rating":["3","4"],"location":"46.22,-68.45"}
    {"index":{"_index":"sports"}}
    {"name":"Pong","birthdate":"1989-11-2","sport":"Baseball","rating":["1","3"],"location":"45.21,-68.35"}
    {"index":{"_index":"sports"}}
    {"name":"Ray","birthdate":"1988-10-3","sport":"Baseball","rating":["2","2"],"location":"45.16,-63.58"}
    {"index":{"_index":"sports"}}
    {"name":"Ping","birthdate":"1992-5-20","sport":"Baseball","rating":["4","3"],"location":"45.22,-68.53"}
    {"index":{"_index":"sports"}}
    {"name":"Duke","birthdate":"1992-2-28","sport":"Baseball","rating":["5","2"],"location":"46.22,-68.85"}
    {"index":{"_index":"sports"}}
    {"name":"Hal","birthdate":"1990-9-9","sport":"Baseball","rating":["4","2"],"location":"45.12,-68.35"}
    {"index":{"_index":"sports"}}
    {"name":"Charge","birthdate":"1990-4-1","sport":"Baseball","rating":["3","2"],"location":"46.12,-68.55"}
    {"index":{"_index":"sports"}}
    {"name":"Barry","birthdate":"1988-3-1","sport":"Baseball","rating":["5","2"],"location":"46.25,-68.55"}
    {"index":{"_index":"sports"}}
    {"name":"Bank","birthdate":"1988-3-1","sport":"Golf","rating":["6","4"],"location":"46.25,-68.55"}
    {"index":{"_index":"sports"}}
    {"name":"Bingo","birthdate":"1988-3-1","sport":"Golf","rating":["10","7"],"location":"46.25,-68.55"}
    {"index":{"_index":"sports"}}
    {"name":"James","birthdate":"1988-3-1","sport":"Basketball","rating":["10","8"],"location":"46.25,-68.55"}
    {"index":{"_index":"sports"}}
    {"name":"Wayne","birthdate":"1988-3-1","sport":"Hockey","rating":["10","10"],"location":"46.25,-68.55"}
    {"index":{"_index":"sports"}}
    {"name":"Brady","birthdate":"1988-3-1","sport":"Football","rating":["10","10"],"location":"46.25,-68.55"}
    {"index":{"_index":"sports"}}
    {"name":"Lewis","birthdate":"1988-3-1","sport":"Football","rating":["10","10"],"location":"46.25,-68.55"}
    '
    
    用Postman演示如下: 图片.png

    3、确认插入了多少条数据(应当22条)

    yay@yay-ThinkPad-T470-W10DG:~$ curl -XGET http://localhost:9200/sports/_count
    {"count":22,"_shards":{"total":1,"successful":1,"skipped":0,"failed":0}}
    yay@yay-ThinkPad-T470-W10DG:~$ 
    

    4、做聚合运算

    4.1 按照name做聚合,求rating的均值

    {
        "size": 0,
        "aggregations": {
            "the_name": {
                "terms": {
                    "field": "name",
                    "order": {
                        "rating_avg": "desc"
                    }
                },
                "aggregations": {
                    "rating_avg": {
                        "avg": {
                            "field": "rating"
                        }
                    }
                }
            }
        }
    }
    
    图片.png

    4.2 在聚合中使用script脚本生成聚合所使用的值

    比如统计年龄(用2019减去出生年份计算)在30到31之间(包含30,不包含31)的有几个人

    GET sports/_search
    {
      "size": 0,
      "aggs": {
        "age_range": {
          "range": {
            "script": {
              "source": 
                """
                ZonedDateTime dob = doc['birthdate'].value;
                return params.now - dob.getYear()
                """
                ,
              "params": {
                "now": 2019
              }
            },
            "ranges": [
              {
                "from": 30,
                "to": 31
              }
            ]
          }
        }
      }
    }
    
    图片.png

    二、Metric Aggregations(指标聚合)

    图片.png

    2.1 value_count聚合

    指定字段有值的文档中该字段值的个数(有些文档的该字段是个数组)

    GET sports/_search
    {
      "size": 0,
      "aggs": {
        "sport_count": {
          "value_count": {
            "field": "rating"
          }
        }
      }
    }
    
    图片.png

    三、Bucket Aggregations(存储桶聚合)

    存储桶聚合是用于对文档进行分组的机制。 每种类型的存储桶聚合都有自己的分割文档集的方法。 也许最简单的类型是terms聚合。 它返回给定字段上被索引的各个唯一term,以及每个term所匹配的文档数量。

    3.1 Term Aggregation

    GET sports/_search
    {
      "size": 0,
      "aggs": {
        "sport": {
          "terms": {
            "field": "sport",
            "size": 10
          }
        }
      }
    }
    
    图片.png

    3.2 geo相关

    3.2.1 geo_distance聚合

    在最简单的情况下,它取一个原点和一个距离范围,然后根据给定的geo_point字段计算圆中有多少文档。

    GET sports/_search
    {
      "size": 0,
      "aggregations": {
        "baseball_player_ring": {
          "geo_distance": {
            "field": "location",
            "origin": "46.12,-68.55",
            "unit": "mi",
            "ranges": [
              {
                "from": 0,
                "to": 20
              }
            ]
          }
        }
      }
    }
    
    图片.png

    3.2.2 geohash_grid聚合

    GET sports/_search
    {
      "size": 0,
       "aggregations": {
            "large-grid": {
                "geohash_grid": {
                    "field": "location",
                    "precision": 3
                }
            }
        }
    }
    
    图片.png

    3.3 内嵌 Bucket Aggregations

    我们可以在geo_distance聚合之内嵌套一个按年龄划分的range聚合(年龄由脚本根据“birthdate”字段计算得出),来进一步细分geo_distance聚合的结果。 假设我们想知道,生活在上一节所定义的圆圈内的运动员中,两个年龄段各有多少人:

    GET sports/_search
    {
       "size": 0,
       "aggregations": {
          "baseball_player_ring": {
             "geo_distance": {
                "field": "location",
                "origin": "46.12,-68.55",
                "unit": "mi",
                "ranges": [
                   {
                      "from": 0,
                      "to": 20
                   }
                ]
             },
             "aggregations": {
                "ring_age_ranges": {
                   "range": {
                     "script": {
                        "source": 
                        """
                        ZonedDateTime dob = doc['birthdate'].value;
                        return params.now - dob.getYear()
                        """
                        ,
                      "params": {
                        "now": 2019
                      }                 
                     }, 
                      "ranges": [
                          { "from": 30, "to": 31 },
                          { "from": 31, "to": 32 }
                      ]
                   }
                }
             }
          }
       }
    }
    
    图片.png

    3.4 多值指标汇总器

    使用stats(多值指标汇总器)来计算最内层结果的一些统计数据。 对于居住在上述圆圈内的运动员,我们现在希望针对两个年龄段中的每一个,根据其结果文档计算“rating”字段的统计信息:

    GET sports/_search
    {
       "size": 0,
       "aggregations": {
          "baseball_player_ring": {
             "geo_distance": {
                "field": "location",
                "origin": "46.12,-68.55",
                "unit": "mi",
                "ranges": [
                   {
                      "from": 0,
                      "to": 20
                   }
                ]
             },
             "aggregations": {
                "ring_age_ranges": {
                   "range": {
                     "script": {
                        "source": 
                        """
                        ZonedDateTime dob = doc['birthdate'].value;
                        return params.now - dob.getYear()
                        """
                      ,
                      "params": {
                        "now": 2019
                      }                 
                     }, 
                      "ranges": [
                          { "from": 30, "to": 31 },
                          { "from": 31, "to": 32 }
                      ]
                   },
                  "aggregations": {
                    "rating_stats": {
                      "stats": {
                          "field": "rating"
                        }
                    }
                  }
                }
             }
          }
       }
    }
    
    图片.png

    3.5 significant terms聚合

    这些聚合旨在搜索数据集中有趣和/或不寻常(uncommonly common)的术语,这些术语可以告诉您有关数据的隐藏属性的更多信息: 图片.png

    Once you start using significant_terms , you find many situations where you don’t want the most popular—you want the most uncommonly common(不寻常). This simple aggregation can uncover
    some surprisingly sophisticated trends in your data.

    比如针对一部不错的电影,我还要找出其他类似的符合我口味的电影,方法是找出有哪些人推荐了这部电影,然后找出这些人推荐的所有电影中推荐次数最高的前几个

    例一

    定义index的mappings

    PUT news
    {
      "mappings": {
        "properties": {
          "published": {
            "type": "date",
            "format": "dateOptionalTime"
          },
          "author": {
            "type": "keyword"
          },
          "title": {
            "type": "text"
          },
          "topic": {
            "type": "keyword"
          },
          "views": {
            "type": "integer"
          }
        }
      }
    }
    

    关键字字段只能按其确切值进行搜索,而文本字段可用于全文搜索。

    批量插入测试数据

    POST news/_bulk
    {"index":{"_index":"news"}}
    {"author":"John Michael","published":"2018-07-08","title":"Tesla is flirting with its lowest close in over 1 1/2 years (TSLA)","topic":"automobile","views":"431"}
    {"index":{"_index":"news"}}
    {"author":"John Michael","published":"2018-07-22","title":"Tesla to end up like Lehman Brothers (TSLA)","topic":"automobile","views":"1921"}
    {"index":{"_index":"news"}}
    {"author":"John Michael","published":"2018-07-29","title":"Tesla (TSLA) official says that they are going to release a new self-driving car model in the coming year","topic":"automobile","views":"1849"}
    {"index":{"_index":"news"}}
    {"author":"John Michael","published":"2018-08-14","title":"Five ways Tesla uses AI and Big Data","topic":"ai","views":"871"}
    {"index":{"_index":"news"}}
    {"author":"John Michael","published":"2018-08-14","title":"Toyota partners with Tesla (TSLA) to improve the security of self-driving cars","topic":"automobile","views":"871"}
    {"index":{"_index":"news"}}
    {"author":"Robert Cann","published":"2018-08-25","title":"Is AI dangerous for humanity","topic":"ai","views":"981"}
    {"index":{"_index":"news"}}
    {"author":"Robert Cann","published":"2018-09-13","title":"Is AI dangerous for humanity","topic":"ai","views":"871"}
    {"index":{"_index":"news"}}
    {"author":"Robert Cann","published":"2018-09-27","title":"Introduction to Generative Adversarial Networks (GANs) in self-driving cars","topic":"automobile","views":"1183"}
    {"index":{"_index":"news"}}
    {"author":"Robert Cann","published":"2018-10-09","title":"Introduction to Natural Language Processing","topic":"ai","views":"786"}
    {"index":{"_index":"news"}}
    {"author":"Robert Cann","published":"2018-10-15","title":"New Distant Objects Found in the Fight for Planet X ","topic":"astronomy","views":"542"}
    

    使用significant term查询
    A.下面尝试在索引中查找每个author的重要topics

    GET news/_search
    {
      "size": 0,
      "aggregations": {
        "authors": {
          "terms": {
            "field": "author"
          },
          "aggregations": {
            "significant_topic_types": {
              "significant_terms": {
                "field": "topic"
              }
            }
          }
        }
      }
    }
    
    图片.png
    B.尝试查找特定author的重要topics
    GET news/_search
    {
      "size": 0, 
      "query": {
        "term": {
          "author": "John Michael"
        }
      },
      "aggregations": {
        "significant_topics": {
          "significant_terms": {
            "field": "topic"
          }
        }
      }
    }
    
    图片.png

    3.6 significant text聚合

    significant text聚合与significant terms聚合基本相同,只是它作用于text字段而不是keyword字段

    GET news/_search
    {
      "size": 0, 
      "query": {
        "match": {
          "title": "Tesla ai"
        }
      },
      "aggregations": {
        "significant_topics": {
          "significant_text": {
            "field": "topic"
          }
        }
      }
    }
    
    图片.png

    相关文章

      网友评论

          本文标题:Aggregation

          本文链接:https://www.haomeiwen.com/subject/hrvxkktx.html