参考WhitespaceTokenizer写一个叫CommaTokenizer的Tokenizer,继承CharTokenizer
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
package com.xxx.yyy.zzz.analyzer;
import org.apache.lucene.analysis.core.WhitespaceTokenizerFactory;
import org.apache.lucene.analysis.util.CharTokenizer;
import org.apache.lucene.util.AttributeFactory;
/**
 * Tokenizer that splits text at comma characters ({@code ','}).
 *
 * <p>Every code point other than a comma is treated as part of a token, so
 * whitespace next to a comma stays inside the neighbouring token.
 *
 * @see org.apache.lucene.analysis.core.WhitespaceTokenizer
 */
public final class CommaTokenizer extends CharTokenizer {

    /** Code point that separates tokens. */
    private static final int COMMA = ',';

    /** Creates a tokenizer using the superclass's no-argument construction. */
    public CommaTokenizer() {
    }

    /**
     * Creates a tokenizer that uses the given attribute factory.
     *
     * @param factory the attribute factory handed to {@link CharTokenizer}
     */
    public CommaTokenizer(AttributeFactory factory) {
        super(factory);
    }

    /**
     * Decides whether a code point belongs to a token.
     *
     * @param c the code point to check
     * @return {@code false} for a comma, {@code true} for any other code point
     */
    @Override
    protected boolean isTokenChar(int c) {
        return c != COMMA;
    }
}
参考WhitespaceTokenizerFactory写一个叫CommaTokenizerFactory的TokenizerFactory,继承TokenizerFactory
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
package com.xxx.yyy.zzz.analyzer;
import org.apache.lucene.analysis.core.WhitespaceTokenizerFactory;
import org.apache.lucene.analysis.util.TokenizerFactory;
import org.apache.lucene.util.AttributeFactory;
import java.util.Map;
/**
 * Factory that creates {@link CommaTokenizer} instances.
 *
 * <p>It accepts no configuration parameters of its own.
 */
public class CommaTokenizerFactory extends TokenizerFactory {

    /**
     * Builds the factory and rejects any arguments left over after the
     * superclass constructor has run.
     *
     * @param args configuration parameters for this factory
     * @throws IllegalArgumentException if {@code args} is not empty after
     *         the superclass constructor returns
     */
    public CommaTokenizerFactory(Map<String,String> args) {
        super(args);
        // NOTE(review): presumably the superclass removes the keys it handles itself - confirm.
        final boolean nothingLeft = args.isEmpty();
        if (nothingLeft) {
            return;
        }
        throw new IllegalArgumentException("Unknown parameters: " + args);
    }

    /**
     * Creates a new tokenizer backed by the given attribute factory.
     *
     * @param factory the attribute factory passed to the tokenizer
     * @return a new {@link CommaTokenizer}
     */
    @Override
    public CommaTokenizer create(AttributeFactory factory) {
        final CommaTokenizer tokenizer = new CommaTokenizer(factory);
        return tokenizer;
    }
}
schema定义与配置
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
<!-- Fields whose values are split into tokens at commas. -->
<field name="comma_name" type="comma_str" indexed="true" stored="true" omitNorms="true"/>
<field name="comma_pattern_name" type="comma_pattern" indexed="true" stored="true" omitNorms="true"/>

<!-- Splits on ',' only, using the custom tokenizer factory. The class name must be the
     factory's fully qualified name, including its package. -->
<fieldType name="comma_str" class="solr.TextField" positionIncrementGap="100">
  <analyzer>
    <tokenizer class="com.xxx.yyy.zzz.analyzer.CommaTokenizerFactory"/>
  </analyzer>
</fieldType>

<!-- Built-in alternative: the regex ", *" matches a comma plus any spaces that follow it,
     so those spaces are dropped from the tokens as well. -->
<fieldType name="comma_pattern" class="solr.TextField" positionIncrementGap="100">
  <analyzer>
    <tokenizer class="solr.PatternTokenizerFactory" pattern=", *" />
  </analyzer>
</fieldType>
没错,这玩意也可以用PatternTokenizerFactory搞定