||
为了比较不同中文分词算法的分词效果,需要对分词的结果进行查看。
下面的代码用于查看Lucene中文分词的结果:
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.cn.smart.SmartChineseAnalyzer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import java.io.IOException;
import java.io.StringReader;
import java.util.ArrayList;
import java.util.List;
public class test {
public static List<String> getAnalyseResult(String analyzeStr, Analyzer analyzer) {
List<String> response = new ArrayList<String>();
TokenStream tokenStream = null;
try {
tokenStream = analyzer.tokenStream("content", new StringReader(analyzeStr));
CharTermAttribute attr = tokenStream.addAttribute(CharTermAttribute.class);
tokenStream.reset();
while (tokenStream.incrementToken()) {
response.add(attr.toString());
}
} catch (IOException e) {
e.printStackTrace();
} finally {
if (tokenStream != null) {
try {
tokenStream.close();
} catch (IOException e) {
e.printStackTrace();
}
}
}
return response;
}
public static void main(String[] args) {
try {
//test getwords funtion
String str = "山东省潍坊市高新技术产业开发区樱前街10815号。";
List<String> lists123 = getAnalyseResult(str, new SmartChineseAnalyzer());
for (String s : lists123) {
System.out.println(s);
}
}
}
}
Archiver|手机版|科学网 ( 京ICP备07017567号-12 )
GMT+8, 2024-9-27 11:50
Powered by ScienceNet.cn
Copyright © 2007- 中国科学报社