美文网首页
elasticsearch java client 自定义分词器

elasticsearch java client 自定义分词器

作者: lz做过前端 | 来源:发表于2023-07-05 14:42 被阅读0次

背景说明

  • 如果直接生成索引,不做任何设置,字符串字段会按默认的动态映射处理(text 类型并附带 keyword 子字段),使用的是默认的 standard 分词器。该分词器不会把连续的字母或数字拆开(例如 "400001" 会作为一个完整的词条),因此无法按其中的片段(例如 "001")进行模糊搜索。
  • 所以我们需要在生成索引之前对其进行设置

API 探索(两种配置方式)

生成索引之前设置

  1. API.2 删除索引
  2. API.4 全局设置
  3. API.7 插入数据
  4. API.11 查看配置
  5. API.12 模糊搜索测试

先生成索引后设置

  1. API.2 删除索引
  2. API.8 插入随意数据
  3. API.9 关闭索引
  4. API.5 配置
  5. API.6 设置
  6. API.10 打开索引
  7. API.3 删除随意数据
  8. API.11 查看配置
  9. API.12 模糊搜索测试

Java client

<properties>
    <elasticsearch.new.version>8.7.1</elasticsearch.new.version>
</properties>

<!-- elasticsearch -->
<dependency>
    <groupId>co.elastic.clients</groupId>
    <artifactId>elasticsearch-java</artifactId>
</dependency>
<dependency>
    <groupId>org.elasticsearch.client</groupId>
    <artifactId>elasticsearch-rest-client</artifactId>
    <version>${elasticsearch.new.version}</version>
</dependency>

目前测试下来 Java client 只支持 先生成索引后设置,大致的代码如下:

// Minimal document type used only for the throwaway document that is indexed
// before the analyzer is configured and deleted again afterwards.
@Data
@AllArgsConstructor
class Fund {
    private String innerCode;
}

// Id of the throwaway document; configAnalyzer deletes it as its last step.
final String placeholderId = "pre_add_end_delete";
// Name under which the custom analyzer is registered in the index settings.
final String analyzerName = "my_ngram_analyzer";
// Tokenizer the custom analyzer is built on.
final String tokenizerName = "ngram";
// Text fields that should be analyzed with the custom analyzer.
final List<String> analyzedFields = List.of("secuCode", "secuAbbr");
final Fund placeholderDocument = new Fund(placeholderId);

ElasticsearchIndicesClient indicesClient = elasticsearchClient.indices();
configAnalyzer(indicesClient, targetName, placeholderId, placeholderDocument, analyzerName, tokenizerName, analyzedFields);

/**
 * Registers a custom analyzer on an index and binds it to the given text fields.
 *
 * <p>Sequence: index a placeholder document, close the index, put the analysis
 * settings, put the field mappings, reopen the index, delete the placeholder
 * document. The index is reopened in a {@code finally} block so that a failure
 * while updating settings or mappings does not leave it closed.
 *
 * @param indicesClient  indices client used for close / putSettings / putMapping / open
 * @param targetName     name of the index to configure
 * @param deleteId       id of the placeholder document that is indexed first and deleted last
 * @param deleteDocument placeholder document body
 * @param analyzer       name under which the custom analyzer is registered
 * @param tokenizer      tokenizer the custom analyzer uses
 * @param fields         names of the fields mapped as {@code text} with the custom analyzer
 *                       (each also gets a {@code keyword} sub-field with {@code ignore_above} 256)
 */
@SneakyThrows
private void configAnalyzer(ElasticsearchIndicesClient indicesClient, String targetName, String deleteId, Object deleteDocument, String analyzer, String tokenizer, List<String> fields) {
    // Index a placeholder document first (per the surrounding article, this is what
    // brings the index into existence before it is configured).
    elasticsearchClient.index(i -> i.index(targetName).id(deleteId).document(deleteDocument));

    // The index must be closed, otherwise the settings update below is rejected.
    indicesClient.close(x -> {
        x.index(List.of(targetName));
        // NOTE(review): printDsl is defined elsewhere; presumably it logs the request DSL.
        elasticSearchService.printDsl(CloseIndexRequest.class, CloseIndexRequest.Builder.class, x);
        return x;
    });

    try {
        // Register the custom analyzer (and the result-window limit) in the index settings.
        indicesClient.putSettings(x -> {
            x.index(targetName).settings(y -> y
                    .maxResultWindow(elasticSearchProperty.getDefaultFundSize())
                    .analysis(z -> z.analyzer(analyzer, a -> a.custom(b -> b.tokenizer(tokenizer)))));
            elasticSearchService.printDsl(PutIndicesSettingsRequest.class, PutIndicesSettingsRequest.Builder.class, x);
            return x;
        });

        // Bind the custom analyzer to every requested field.
        indicesClient.putMapping(x -> {
            x.index(List.of(targetName));
            for (String field : fields) {
                x.properties(field, Property.of(y -> y.text(z -> z
                        .analyzer(analyzer)
                        .fields(Property.Kind.Keyword.jsonValue(), a -> a.keyword(b -> b.ignoreAbove(256))))));
            }
            elasticSearchService.printDsl(PutMappingRequest.class, PutMappingRequest.Builder.class, x);
            return x;
        });
    } finally {
        // Always reopen: a closed index accepts no document changes, so it must not
        // stay closed when the settings or mapping update fails.
        indicesClient.open(x -> {
            x.index(List.of(targetName));
            elasticSearchService.printDsl(OpenRequest.class, OpenRequest.Builder.class, x);
            return x;
        });
    }

    // Finally remove the placeholder document that was indexed at the start.
    elasticsearchClient.delete(i -> i.index(targetName).id(deleteId));
}

客户端配置

/**
 * Elasticsearch client settings bound from configuration properties with the
 * prefix {@code elastic-search}. Accessors are generated by Lombok's {@code @Data}.
 */
@Data
@Component
@ConfigurationProperties(prefix = "elastic-search")
public class ElasticSearchProperty implements Serializable {
    // Host name and port of the Elasticsearch node; buildElasticsearchClient passes them to HttpHost.
    private String hostname;
    private int port;

    // Passed to RequestConfig.setConnectTimeout in buildElasticsearchClient.
    // NOTE(review): presumably milliseconds — confirm against the HTTP client documentation.
    private int connectTimeout;

    // Passed to RequestConfig.setSocketTimeout in buildElasticsearchClient.
    // NOTE(review): presumably milliseconds — confirm against the HTTP client documentation.
    private int socketTimeout;

    // Used by configAnalyzer as the index's max_result_window setting.
    private int defaultFundSize = 9999;

    // Not referenced by the code shown in this article.
    private int defaultManagerSize = 9999;

    // Buffer limit handed to HeapBufferedResponseConsumerFactory in buildElasticsearchClient.
    // NOTE(review): the default looks like 200 MiB expressed in bytes — confirm the unit.
    private int limitSize = 200 * 1024 * 1024;
}

/**
 * Creates the {@link ElasticsearchClient} bean from the configured connection properties.
 *
 * <p>The low-level HTTP client is customised with connect/socket timeouts, TCP keep-alive,
 * a JSON content-type default header and a response interceptor that adds the
 * {@code X-Elastic-Product} header to every response. The JSON mapper ignores unknown
 * properties, accepts single values as arrays and supports {@code java.time} types.
 *
 * @param property connection settings (host, port, timeouts, response buffer limit)
 * @return a client using the customised transport and request options
 */
@Bean
public ElasticsearchClient buildElasticsearchClient(ElasticSearchProperty property) {
    HttpHost targetHost = new HttpHost(property.getHostname(), property.getPort());

    // Background for the response interceptor below:
    // https://stackoverflow.com/questions/71142680/co-elastic-clients-transport-transportexception-es-search-missing-x-elastic
    // Default headers configured on the builder are request headers, not response headers.
    // Older Elasticsearch servers do not send X-Elastic-Product=Elasticsearch in their
    // responses, while newer elasticsearch-java clients expect it, so it is added here
    // on the client side.
    RestClientBuilder.HttpClientConfigCallback httpClientConfigCallback = asyncClientBuilder -> {
        RequestConfig timeouts = RequestConfig.custom()
                .setConnectTimeout(property.getConnectTimeout())
                .setSocketTimeout(property.getSocketTimeout())
                .build();

        // Keep-alive addresses "java.io.IOException: Connection reset by peer", see
        // https://cloud.tencent.com/developer/article/1943055
        IOReactorConfig keepAliveReactor = IOReactorConfig.custom()
                .setSoKeepAlive(true)
                .build();

        HttpResponseInterceptor productHeaderInjector =
                (response, context) -> response.addHeader("X-Elastic-Product", "Elasticsearch");

        return asyncClientBuilder
                .setDefaultRequestConfig(timeouts)
                .setDefaultIOReactorConfig(keepAliveReactor)
                .setDefaultCredentialsProvider(new BasicCredentialsProvider())
                .setDefaultHeaders(List.of(new BasicHeader(HttpHeaders.CONTENT_TYPE, ContentType.APPLICATION_JSON.toString())))
                .addInterceptorLast(productHeaderInjector);
    };

    RestClient lowLevelClient = RestClient.builder(targetHost)
            .setHttpClientConfigCallback(httpClientConfigCallback)
            .build();

    JacksonJsonpMapper mapper = new JacksonJsonpMapper();
    mapper.objectMapper()
            .configure(DeserializationFeature.FAIL_ON_UNKNOWN_PROPERTIES, false)
            .configure(DeserializationFeature.ACCEPT_SINGLE_VALUE_AS_ARRAY, true)
            // Adds LocalDate / LocalDateTime (de)serialization support.
            .registerModule(new JavaTimeModule());

    // About typed_keys (default: /index_name/_search?typed_keys=true):
    // search requests accept a typed_keys parameter that returns type information together
    // with the name in aggregation and suggestion results, e.g.
    //   "aggregations":{"avg#price":{"value":3.14}}   versus   "aggregations":{"price":{"value":3.14}}
    // When the application is a front-end to services expecting the plain format, typed-keys
    // serialization can be switched off on the mapper:
    //   mapper.withAttribute(JsonpMapperFeatures.SERIALIZE_TYPED_KEYS, false);
    // That call is intentionally left out here, so the mapper's default is kept.
    ElasticsearchTransport transport = new RestClientTransport(lowLevelClient, mapper);

    // Raise the response buffer limit to the configured size.
    RequestOptions.Builder optionsBuilder = RequestOptions.DEFAULT.toBuilder();
    optionsBuilder.setHttpAsyncResponseConsumerFactory(
            new HttpAsyncResponseConsumerFactory.HeapBufferedResponseConsumerFactory(property.getLimitSize()));
    RestClientOptions transportOptions = new RestClientOptions(optionsBuilder.build());

    return new ElasticsearchClient(transport, transportOptions);
}

API

  1. 查看数据
get /person/_search
  2. 删除索引
delete /person
  3. 删除其中一条数据
delete /person/_doc/pre_add_end_delete_id
  4. 索引生成前对索引进行设置(自定义分词器+对字段进行设置)
PUT /person
{
  "settings": {
    "index": {
      "max_result_window": 50000
    },
    "analysis": {
      "analyzer": {
        "my_ngram_analyzer":{
          "type":"custom",
          "char_filter":[
              "my_html_strip",
              "my_punctuation_mapping"
            ],
          "tokenizer": "my_tokenizer",
          "filter": ["my_stop_token_filter"]
        }
      },
      "char_filter": {
        "my_punctuation_mapping":{
          "type":"mapping",
          "mappings":["* => _","= => ~"]
        },
        "my_html_strip":{
          "type":"html_strip"
        }
      },
      "tokenizer": {
        "my_tokenizer":{
          "type":"ngram"
        }
      },
      "filter": {
        "my_stop_token_filter":{
          "type":"stop",
          "ignore_case":true,
          "stopwords": ["is","a","the"]
        }
      }
    }
  },
  "mappings": {
    "properties": {
        "first_name" : {
          "type": "text",
          "analyzer": "my_ngram_analyzer"
        },
        "interests" : {
          "type" : "text",
          "analyzer": "my_ngram_analyzer",
          "fields" : {
            "keyword" : {
              "type" : "keyword",
              "ignore_above" : 256
            }
          }
        }
    }
  }
}
  5. 先有索引再进行设置(简化版本)
PUT /person/_settings
{
  "analysis": {
    "analyzer": {
      "my_ngram_analyzer":{
        "type":"custom",
        "char_filter":[],
        "tokenizer": "ngram",
        "filter": []
      }
    }
  }
}
  6. 先有索引再进行设置(对字段设置)
POST /person/_mapping
{
  "properties": {
      "first_name" : {
        "type": "text",
        "analyzer": "my_ngram_analyzer"
      },
      "interests" : {
        "type" : "text",
        "analyzer": "my_ngram_analyzer",
        "fields" : {
          "keyword" : {
            "type" : "keyword",
            "ignore_above" : 256
          }
        }
      }
    }
}
  7. 插入测试数据
PUT /person/_doc/1
{
    "first_name" : "400001.OF",
    "last_name" :  "Smith",
    "age" :        25,
    "about" :      "I love to go rock bbb",
    "about_true" :      "I love to go rock bbb",
    "interests": [ "400001.OF", "music" ],
    "someday": "1989-02-09"
}

PUT /person/_doc/2
{
    "first_name" : "付水电费水电费",
    "last_name" :  "Smith",
    "age" :        25,
    "about" :      "I love to go rock bbb",
    "about_true" :      "I love to go rock bbb",
    "interests": [ "400001.OF", "music" ],
    "someday": "1989-02-09"
}
  8. 插入非设置字段数据
PUT /person/_doc/pre_add_end_delete_id
{
    "age" : 25
}
  9. 关闭索引
POST /person/_close
  10. 打开索引
POST /person/_open
  11. 查看索引设置
GET person/_settings
GET /person/_mapping
  12. 模糊搜索
POST /person/_search
{
  "query": {
    "match_phrase": {
      "first_name": "001"
    }
  }
}

POST /person/_search
{
  "query": {
    "match_phrase": {
      "first_name": "费水"
    }
  }
}

POST /person/_search
{
  "query": {
    "match_phrase": {
      "interests": "001"
    }
  }
}

相关文章

网友评论

      本文标题:elasticsearch java client 自定义分词器

      本文链接:https://www.haomeiwen.com/subject/frskudtx.html