博客
关于我
强烈建议你试试无所不能的chatGPT,快点击我
lucene IKAnalyzer中文分词器学习(1)
阅读量:6188 次
发布时间:2019-06-21

本文共 3717 字,大约阅读时间需要 12 分钟。

hot3.png

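<dependency>
    <groupId>org.apache.lucene</groupId>
    <artifactId>lucene-core</artifactId>
    <version>5.3.1</version>
</dependency>
<dependency>
    <groupId>org.apache.lucene</groupId>
    <artifactId>lucene-analyzers-common</artifactId>
    <version>5.3.1</version>
</dependency>
<dependency>
    <groupId>org.apache.lucene</groupId>
    <artifactId>lucene-analyzers-smartcn</artifactId>
    <version>5.3.1</version>
</dependency>
<dependency>
    <groupId>org.apache.lucene</groupId>
    <artifactId>lucene-queryparser</artifactId>
    <version>5.3.1</version>
</dependency>
<dependency>
    <groupId>org.apache.lucene</groupId>
    <artifactId>lucene-highlighter</artifactId>
    <version>5.3.1</version>
</dependency>
<dependency>
    <groupId>com.janeluo</groupId>
    <artifactId>ikanalyzer</artifactId>
    <version>2012_u6</version>
</dependency>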

IKAnalyzer 本身不支持 Lucene 5.x,添加下面两个适配类(IKAnalyzer5x 和 IKTokenizer5x)即可:

import org.apache.lucene.analysis.Analyzer;

/**
 * Analyzer that produces its token stream from {@link IKTokenizer5x}, so that the
 * IK segmenter can be used with Lucene 5.x.
 */
public class IKAnalyzer5x extends Analyzer {

    /** Flag handed to every {@link IKTokenizer5x} created by this analyzer. */
    private boolean useSmart;

    /** Creates an analyzer with the {@code useSmart} flag switched off. */
    public IKAnalyzer5x() {
        this(false);
    }

    /**
     * Creates an analyzer with an explicit {@code useSmart} flag.
     *
     * @param useSmart value forwarded to the tokenizer's constructor
     */
    public IKAnalyzer5x(boolean useSmart) {
        this.useSmart = useSmart;
    }

    /**
     * @return the current {@code useSmart} flag
     */
    public boolean useSmart() {
        return useSmart;
    }

    /**
     * Replaces the {@code useSmart} flag; it is read each time
     * {@link #createComponents(String)} builds a tokenizer.
     *
     * @param useSmart new flag value
     */
    public void setUseSmart(boolean useSmart) {
        this.useSmart = useSmart;
    }

    /**
     * Builds the analysis chain: a single {@link IKTokenizer5x} with no filters.
     *
     * @param fieldName name of the field being analyzed (not used here)
     * @return components wrapping a new tokenizer configured with the current flag
     */
    @Override
    protected TokenStreamComponents createComponents(String fieldName) {
        return new TokenStreamComponents(new IKTokenizer5x(useSmart));
    }
}
import org.apache.lucene.analysis.Tokenizer;import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;import org.apache.lucene.analysis.tokenattributes.TypeAttribute;import org.apache.lucene.util.AttributeFactory;import org.wltea.analyzer.core.IKSegmenter;import org.wltea.analyzer.core.Lexeme;import java.io.IOException;public class IKTokenizer5x extends Tokenizer {    private IKSegmenter _IKImplement;    private final CharTermAttribute termAtt = (CharTermAttribute)this.addAttribute(CharTermAttribute.class);    private final OffsetAttribute offsetAtt = (OffsetAttribute)this.addAttribute(OffsetAttribute.class);    private final TypeAttribute typeAtt = (TypeAttribute)this.addAttribute(TypeAttribute.class);    private int endPosition;    public IKTokenizer5x() {        this._IKImplement = new IKSegmenter(this.input, true);    }    public IKTokenizer5x(boolean useSmart) {        this._IKImplement = new IKSegmenter(this.input, useSmart);    }    public IKTokenizer5x(AttributeFactory factory) {        super(factory);        this._IKImplement = new IKSegmenter(this.input, true);    }    public boolean incrementToken() throws IOException {        this.clearAttributes();        Lexeme nextLexeme = this._IKImplement.next();        if(nextLexeme != null) {            this.termAtt.append(nextLexeme.getLexemeText());            this.termAtt.setLength(nextLexeme.getLength());            this.offsetAtt.setOffset(nextLexeme.getBeginPosition(), nextLexeme.getEndPosition());            this.endPosition = nextLexeme.getEndPosition();            this.typeAtt.setType(nextLexeme.getLexemeTypeString());            return true;        } else {            return false;        }    }    public void reset() throws IOException {        super.reset();        this._IKImplement.reset(this.input);    }    public final void end() {        int finalOffset = this.correctOffset(this.endPosition);        
this.offsetAtt.setOffset(finalOffset, finalOffset);    }}

调用分词器进行测试:

Analyzer analyzer = new IKAnalyzer5x(true);TokenStream tokenStream = analyzer.tokenStream("test", "一个新款韩版长袖羊驼绒羊毛皮草外套女海宁皮草");OffsetAttribute offsetAttribute = tokenStream.addAttribute(OffsetAttribute.class);tokenStream.reset();while (tokenStream.incrementToken()) {    System.out.println(offsetAttribute.toString());}

 

 

转载于:https://my.oschina.net/zhuqianli/blog/1583691

你可能感兴趣的文章
android语音朗读功能demo实现
查看>>
CRT或XSHELL工具连接centos7经常自动断开
查看>>
使用 QuickBI 搭建酷炫可视化分析
查看>>
1-1 Zabbix 监控安装
查看>>
最近运维中遇到的一些问题及解决方法
查看>>
Hadoop集群(第10期副刊)_常用MySQL数据库命令
查看>>
LinkedHashMap的实现原理
查看>>
shell 判断语句
查看>>
oracle闪回操作详解
查看>>
浅谈身份、数字身份与电子签约的关系
查看>>
小微企业不注册商标的严重后果
查看>>
2018-11-07 直播笔记
查看>>
正向解析DNS服务
查看>>
ospf与eigrp通过修改管理距离实现路由的负载
查看>>
Oracle 数据库的备份与恢复
查看>>
PLSQL异常处理
查看>>
Oracle 11g r2全外连接优化执行计划(三)
查看>>
tomcat
查看>>
Jenkins 部署
查看>>
我的友情链接
查看>>