다음은 제가 일반적으로 사용하는 유틸리티 메서드입니다.
<dependency>
<groupId>org.apache.lucene</groupId>
<artifactId>lucene-core</artifactId>
<version>4.10.3</version>
</dependency>
<dependency>
<groupId>org.apache.lucene</groupId>
<artifactId>lucene-analyzers-common</artifactId>
<version>4.10.3</version>
</dependency>
:
/**
 * Tokenizes the given sentence with Lucene and returns the distinct,
 * lower-cased n-grams (shingles) of length 1 up to {@code ngramCount}.
 *
 * @param sentence   the raw text to tokenize
 * @param ngramCount the maximum shingle size; 1 yields plain unigrams only
 * @return the set of distinct lower-cased tokens/shingles; on an I/O error
 *         the tokens collected so far are returned (best-effort, as before)
 */
private Set<String> generateNgrams(String sentence, int ngramCount) {
    Set<String> ngrams = new HashSet<>();
    // Analysis chain: StandardTokenizer -> StandardFilter -> (optional) ShingleFilter.
    TokenStream tokenStream = new StandardFilter(new StandardTokenizer(new StringReader(sentence)));
    TokenFilter sf;
    if (ngramCount == 1) {
        // Unigrams only: no shingling needed.
        sf = new StandardFilter(tokenStream);
    } else {
        // Typed local avoids the cast the original needed for setMaxShingleSize.
        ShingleFilter shingleFilter = new ShingleFilter(tokenStream);
        shingleFilter.setMaxShingleSize(ngramCount);
        sf = shingleFilter;
    }
    CharTermAttribute charTermAttribute = sf.addAttribute(CharTermAttribute.class);
    // try-with-resources closes the whole chain (including the StringReader)
    // even if incrementToken() throws; the original leaked the stream on error
    // because close() sat inside the try block.
    try (TokenFilter stream = sf) {
        stream.reset();
        while (stream.incrementToken()) {
            // Locale.ROOT makes lower-casing locale-independent
            // (avoids e.g. the Turkish dotless-i problem in token keys).
            ngrams.add(charTermAttribute.toString().toLowerCase(java.util.Locale.ROOT));
        }
        stream.end();
    } catch (IOException ex) {
        // Best-effort contract preserved: report and return what was collected.
        ex.printStackTrace();
    }
    return ngrams;
}
위는 Maven에 필요한 Lucene 종속성입니다. Lucene 4.10에서 동작하며, 더 낮거나 높은 버전에서는 테스트하지 않았습니다.