美文网首页
HanLPTokenizer HanLP分词器

HanLPTokenizer HanLP分词器

作者: lanlantian123 | 来源:发表于2018-11-15 14:50 被阅读0次

    HanLP 在功能上的扩展主要体现在以下几个方面:

    •关键词提取 

    •自动摘要

    •短语提取 

    •拼音转换

    •简繁转换

    •文本推荐

    下面是 HanLP 分词器的代码

    注:使用maven依赖 

    <dependency>  

       <groupId>com.hankcs</groupId>  

       <artifactId>hanlp</artifactId>  

       <version>portable-1.3.4</version>  

    </dependency> 

    使用了java8进行处理

    import java.util.ArrayList;
    import java.util.List;
    import java.util.stream.Collectors;
    import java.util.stream.Stream;

    import org.apache.commons.lang3.StringUtils;

    import com.hankcs.hanlp.seg.Dijkstra.DijkstraSegment;
    import com.hankcs.hanlp.seg.NShort.NShortSegment;
    import com.hankcs.hanlp.seg.Segment;
    import com.hankcs.hanlp.tokenizer.IndexTokenizer;
    import com.hankcs.hanlp.tokenizer.NLPTokenizer;
    import com.hankcs.hanlp.tokenizer.SpeedTokenizer;
    import com.hankcs.hanlp.tokenizer.StandardTokenizer;

    /**
     * Static helpers that run the different HanLP segmenters over a piece of text
     * and return the non-blank words they produce as plain strings.
     *
     * <p>Every method returns an empty list when {@code text} is {@code null}.
     * All methods except {@link #speed(String)} also remove repeated words,
     * keeping the first occurrence of each.
     */
    public class HanLPTokenizer {

        /** N-shortest-path segmenter: custom dictionary disabled, place and organization recognition enabled. */
        private static final Segment N_SHORT_SEGMENT = new NShortSegment().enableCustomDictionary(false)
                .enablePlaceRecognize(true).enableOrganizationRecognize(true);

        /** Dijkstra (shortest-path) segmenter: custom dictionary disabled, place and organization recognition enabled. */
        private static final Segment DIJKSTRA_SEGMENT = new DijkstraSegment().enableCustomDictionary(false)
                .enablePlaceRecognize(true).enableOrganizationRecognize(true);

        /**
         * Standard segmentation.
         *
         * @param text the text to segment; may be {@code null}
         * @return the distinct non-blank words in order of first appearance; empty if {@code text} is {@code null}
         */
        public static List<String> standard(String text) {
            if (text == null) {
                return new ArrayList<>();
            }
            return collectWords(StandardTokenizer.segment(text).stream().map(term -> term.word), true);
        }

        /**
         * NLP segmentation.
         *
         * @param text the text to segment; may be {@code null}
         * @return the distinct non-blank words in order of first appearance; empty if {@code text} is {@code null}
         */
        public static List<String> nlp(String text) {
            if (text == null) {
                return new ArrayList<>();
            }
            return collectWords(NLPTokenizer.segment(text).stream().map(term -> term.word), true);
        }

        /**
         * Index segmentation.
         *
         * @param text the text to segment; may be {@code null}
         * @return the distinct non-blank words in order of first appearance; empty if {@code text} is {@code null}
         */
        public static List<String> index(String text) {
            if (text == null) {
                return new ArrayList<>();
            }
            return collectWords(IndexTokenizer.segment(text).stream().map(term -> term.word), true);
        }

        /**
         * High-speed dictionary segmentation.
         *
         * <p>Unlike the other methods in this class, repeated words are kept.
         *
         * @param text the text to segment; may be {@code null}
         * @return the non-blank words in segmentation order, duplicates included; empty if {@code text} is {@code null}
         */
        public static List<String> speed(String text) {
            if (text == null) {
                return new ArrayList<>();
            }
            return collectWords(SpeedTokenizer.segment(text).stream().map(term -> term.word), false);
        }

        /**
         * N-shortest-path segmentation.
         *
         * @param text the text to segment; may be {@code null}
         * @return the distinct non-blank words in order of first appearance; empty if {@code text} is {@code null}
         */
        public static List<String> nShort(String text) {
            if (text == null) {
                return new ArrayList<>();
            }
            return collectWords(N_SHORT_SEGMENT.seg(text).stream().map(term -> term.word), true);
        }

        /**
         * Shortest-path segmentation.
         *
         * @param text the text to segment; may be {@code null}
         * @return the distinct non-blank words in order of first appearance; empty if {@code text} is {@code null}
         */
        public static List<String> shortest(String text) {
            if (text == null) {
                return new ArrayList<>();
            }
            return collectWords(DIJKSTRA_SEGMENT.seg(text).stream().map(term -> term.word), true);
        }

        /**
         * Drops blank words and gathers the rest into a new mutable list.
         *
         * @param words  the raw words produced by a segmenter
         * @param unique whether repeated words should be removed (first occurrence wins)
         * @return a new {@link ArrayList} holding the retained words in encounter order
         */
        private static List<String> collectWords(Stream<String> words, boolean unique) {
            Stream<String> kept = words.filter(StringUtils::isNotBlank);
            if (unique) {
                kept = kept.distinct();
            }
            // Collect into an ArrayList explicitly so callers always get a mutable list.
            return kept.collect(Collectors.toCollection(ArrayList::new));
        }

        /**
         * Small demo: segments one sample string with every segmenter and prints the results.
         *
         * @param args ignored
         */
        public static void main(String[] args) {
            String text = "测试勿动12";

            System.out.println("标准分词:" + standard(text));
            System.out.println("NLP分词:" + nlp(text));
            System.out.println("索引分词:" + index(text));
            System.out.println("N-最短路径分词:" + nShort(text));
            System.out.println("最短路径分词分词:" + shortest(text));
            System.out.println("极速词典分词:" + speed(text));
        }

    }

    文章来源于猴德华的博客

    相关文章

      网友评论

          本文标题:HanLPTokenizer HanLP分词器

          本文链接:https://www.haomeiwen.com/subject/rwsyfqtx.html