近义词搜索

发布时间:2017-3-31 22:04:31编辑:www.fx114.net 分享查询网我要评论
本篇文章主要介绍了"近义词搜索",主要涉及到近义词搜索方面的内容,对于近义词搜索感兴趣的同学可以参考一下。

“西语”是“西班牙语”的简称,当我搜“西语”时,我希望搜索结果也包含“西班牙语”。
所以我要为分词器加上一层过滤器,用于处理近义词。
分词的结果用TokenStream表示,一个TokenStream包含一串Token,每个Token表示一个分词,包含词的内容,在句子中的位置等。
近义词过滤器要实现的是:将近义词加入到TokenStream中,并且与原词处于相同的位置(即近义词的位置增量为0)。要实现的结果如下所示:

/**
 * Demo entry point: tokenizes a fixed sample sentence with
 * {@code SynonymAnalyzer} and prints every token grouped by position.
 *
 * <p>A token whose position increment is greater than zero starts a new
 * position (printed as {@code "<position>: "}); tokens with an increment of
 * zero are printed under the current position.
 *
 * @param args unused
 * @throws IOException if the token stream fails while being consumed
 */
public static void main(String[] args) throws IOException {
    Analyzer analyzer = new SynonymAnalyzer();
    try {
        TokenStream stream = analyzer.tokenStream("", "西班牙语学习");
        try {
            CharTermAttribute termAtt = stream.addAttribute(CharTermAttribute.class);
            PositionIncrementAttribute posIncr = stream.addAttribute(PositionIncrementAttribute.class);

            int position = 0;
            stream.reset();
            while (stream.incrementToken()) {
                int increment = posIncr.getPositionIncrement();
                if (increment > 0) {
                    position = position + increment;
                    System.out.println(position + ": ");
                }
                System.out.println("[" + termAtt.toString() + "]");
            }
            // Complete the TokenStream consumer workflow: end() must follow the
            // last incrementToken() call and precede close().
            stream.end();
        } finally {
            // Close even when tokenization fails so the stream's resources are released.
            stream.close();
        }
    } finally {
        analyzer.close();
    }
}

结果为:
1:
[西班牙语]
[西语]
2:
[学习]

实现代码如下:

package com.analyzer.test;

import java.io.IOException;
import java.util.Stack;

import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import org.apache.lucene.util.AttributeSource;

/**
 * Token filter that emits, after each token from the wrapped stream, the
 * synonyms returned by a {@link SynonymEngine} for that token.
 *
 * <p>Each synonym is emitted with a position increment of 0, so it occupies
 * the same position as the original token. All other attributes of the
 * original token are restored from a captured state before the term text is
 * replaced.
 */
public class SynonymFilter extends TokenFilter {
    public static final String TOKEN_TYPE_SYNONYM = "SYNONYM";

    // Synonyms of the most recently read token that have not been emitted yet.
    private final Stack<String> synonymStack;
    private final SynonymEngine engine;
    // Attribute state of the original token, restored for every synonym emitted.
    private AttributeSource.State current;

    // Token text attribute
    private final CharTermAttribute termAtt;
    // Token position-increment attribute
    private final PositionIncrementAttribute posIncrAtt;

    /**
     * Creates a filter over {@code input}.
     *
     * @param input  the token stream to decorate
     * @param engine the synonym lookup; may return {@code null} for a term
     *               that has no synonyms
     */
    protected SynonymFilter(TokenStream input, SynonymEngine engine) {
        super(input);
        synonymStack = new Stack<String>();
        this.engine = engine;

        termAtt = addAttribute(CharTermAttribute.class);
        posIncrAtt = addAttribute(PositionIncrementAttribute.class);
    }

    /**
     * Emits a pending synonym if there is one; otherwise advances the wrapped
     * stream and queues the synonyms of the new token.
     *
     * @return {@code false} once the wrapped stream is exhausted and no
     *         synonyms are pending, {@code true} otherwise
     * @throws IOException if the wrapped stream fails
     */
    @Override
    public boolean incrementToken() throws IOException {
        if (!synonymStack.isEmpty()) {
            String syn = synonymStack.pop();
            // Start from the original token's attributes, then swap in the synonym text.
            restoreState(current);
            termAtt.setEmpty();
            termAtt.append(syn);
            // Increment 0 places the synonym at the same position as the original token.
            posIncrAtt.setPositionIncrement(0);
            return true;
        }

        if (!input.incrementToken()) {
            return false;
        }
        if (addAliasesToStack()) {
            current = captureState();
        }
        return true;
    }

    /**
     * Resets the wrapped stream and discards any synonyms still pending from
     * a previous, possibly partially consumed, stream so that they cannot
     * leak into the next one when this filter instance is reused.
     *
     * @throws IOException if the wrapped stream fails to reset
     */
    @Override
    public void reset() throws IOException {
        super.reset();
        synonymStack.clear();
        current = null;
    }

    /**
     * Looks up the synonyms of the current term and pushes every entry that
     * differs from the term itself onto the pending stack.
     *
     * @return {@code true} if at least one synonym was queued
     */
    private boolean addAliasesToStack() {
        String term = termAtt.toString();
        String[] synonyms = engine.getSynonyms(term);
        if (synonyms == null) {
            return false;
        }
        boolean pushed = false;
        for (String synonym : synonyms) {
            // Skip the term itself (and null entries) so it is not emitted twice.
            if (synonym != null && !term.equals(synonym)) {
                synonymStack.push(synonym);
                pushed = true;
            }
        }
        return pushed;
    }
}
package com.analyzer.test;

/**
 * Lookup service that maps a term to its group of synonyms.
 */
public interface SynonymEngine {

    /**
     * Returns the synonyms known for the given term.
     *
     * @param s the term to look up
     * @return the synonyms of {@code s}; what is returned for an unknown term
     *         is up to the implementation
     */
    public String[] getSynonyms(String s);
}
package com.analyzer.test;

import java.io.Reader;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.wltea.analyzer.lucene.IKTokenizer;

/**
 * Analyzer whose token stream is an {@code IKTokenizer} wrapped in a
 * {@code SynonymFilter} backed by a {@code TestSynonymEngine}.
 */
public class SynonymAnalyzer extends Analyzer {

    /**
     * Builds the analysis chain for a field: tokenizer first, synonym filter
     * on top of it.
     *
     * @param fieldName the field being analyzed (not used by this chain)
     * @param reader    the source text
     * @return components with the tokenizer as source and the synonym filter
     *         as the resulting stream
     */
    @Override
    protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
        // NOTE(review): the boolean flag presumably selects IK's "smart" segmentation mode - confirm.
        final Tokenizer source = new IKTokenizer(reader, true);
        final SynonymEngine synonymEngine = new TestSynonymEngine();
        return new TokenStreamComponents(source, new SynonymFilter(source, synonymEngine));
    }

}
package com.analyzer.test;

import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.util.HashMap;

/**
 * {@link SynonymEngine} backed by the classpath resource {@code synonym.txt}.
 *
 * <p>Each non-blank line of the resource is one synonym group: words
 * separated by whitespace. Every word of a group is mapped to the whole
 * group (including itself). The resource is read as UTF-8.
 */
public class TestSynonymEngine implements SynonymEngine {

    // word -> full synonym group the word belongs to; filled once at class load.
    private static final HashMap<String, String[]> map = new HashMap<String, String[]>();

    static {
        InputStream in = Thread.currentThread().getContextClassLoader().getResourceAsStream("synonym.txt");
        if (in == null) {
            // Missing resource: keep the engine usable (no synonyms) instead of
            // failing class initialization with a NullPointerException.
            System.err.println("synonym.txt not found on the classpath; no synonyms loaded");
        } else {
            BufferedReader br = null;
            try {
                // Explicit charset: the platform default would garble non-ASCII
                // words on systems whose default encoding is not UTF-8.
                br = new BufferedReader(new InputStreamReader(in, "UTF-8"));
                String line;
                while ((line = br.readLine()) != null) {
                    String trimmed = line.trim();
                    if (trimmed.length() == 0) {
                        continue; // a blank line would otherwise register "" as a word
                    }
                    // Split on runs of whitespace so repeated spaces or tabs do
                    // not produce empty "words".
                    String[] words = trimmed.split("\\s+");
                    for (String word : words) {
                        map.put(word, words);
                    }
                }
            } catch (IOException e) {
                e.printStackTrace();
            } finally {
                try {
                    if (br != null) {
                        br.close();
                    } else {
                        in.close();
                    }
                } catch (IOException e) {
                    e.printStackTrace();
                }
            }
        }
    }

    /**
     * Returns the synonym group containing {@code s}.
     *
     * @param s the word to look up
     * @return the group (which includes {@code s} itself), or {@code null}
     *         if the word is not in the loaded resource
     */
    @Override
    public String[] getSynonyms(String s) {
        return map.get(s);
    }

}
package com.pyc.search.searchservice.lucene;

import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import org.apache.log4j.Logger;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import com.analyzer.test.SynonymAnalyzer;

/**
 * Splits text into terms using {@code SynonymAnalyzer}; the result contains
 * every token the analyzer emits, in emission order.
 */
public class SynonymSegmenter {

    private static final Logger logger = Logger.getLogger(SynonymSegmenter.class);

    /**
     * Tokenizes {@code input} and returns the text of every emitted token.
     *
     * @param input the text to analyze
     * @return the token texts in emission order, or {@code null} if an
     *         {@link IOException} occurred during analysis (the error is logged)
     */
    public String[] analysis(String input) {
        Analyzer analyzer = new SynonymAnalyzer();
        try {
            TokenStream stream = analyzer.tokenStream("", input);
            try {
                CharTermAttribute termAtt = stream.addAttribute(CharTermAttribute.class);

                List<String> list = new ArrayList<String>();

                stream.reset();
                while (stream.incrementToken()) {
                    list.add(termAtt.toString());
                }
                // Complete the TokenStream consumer workflow before closing.
                stream.end();

                return list.toArray(new String[list.size()]);
            } finally {
                // Release the stream even when tokenization fails midway.
                stream.close();
            }
        } catch (IOException e) {
            // Pass the exception as the Throwable argument so the stack trace is logged.
            logger.error("Failed to tokenize input", e);
            return null;
        } finally {
            // A new analyzer is created per call, so it must be closed per call.
            analyzer.close();
        }
    }

}

如有错误,欢迎指正。

参考:

Lucene in Action(第二版)4.5节 Synonyms, aliases, and words that mean the same 131


上一篇:Intellij Idea15 快捷键设置大全
下一篇:工作的一半或者更多时间在为同伴或者自己填坑

相关文章

关键词: 近义词搜索

相关评论

本站评论功能暂时取消,后续此功能例行通知。

一、不得利用本站危害国家安全、泄露国家秘密,不得侵犯国家社会集体的和公民的合法权益,不得利用本站制作、复制和传播不法有害信息!

二、互相尊重,对自己的言论和行为负责。

好贷网好贷款