Nemo_Lucene入门样例 - 第一个demo_Nemo社区_LinkNemo

该文章投稿至Nemo社区 Java 板块复制链接

Lucene入门样例 - 第一个demo

发布于 2017/09/20 18:34 3,262浏览 2回复 6,576字

这两天正好有点空闲，稍稍看了一些lucene的内容。晚点可能会在link-nemo上加入相关的使用。这里先记录一个最简单的样例代码。

Lucene是apache基金会里头比较有名的一个项目了，之前用过基于它的另一个全文检索框架Solr。

需要的依赖jar：


        <dependency>
            <groupId>org.apache.lucene</groupId>
            <artifactId>lucene-core</artifactId>
            <version>5.3.1</version>
        </dependency>

        <dependency>
            <groupId>org.apache.lucene</groupId>
            <artifactId>lucene-analyzers-common</artifactId>
            <version>5.3.1</version>
        </dependency>

        <dependency>
            <groupId>org.apache.lucene</groupId>
            <artifactId>lucene-queryparser</artifactId>
            <version>5.3.1</version>
        </dependency>

        <dependency>
            <groupId>org.apache.lucene</groupId>
            <artifactId>lucene-highlighter</artifactId>
            <version>5.3.1</version>
        </dependency>

        <dependency>
            <groupId>org.apache.lucene</groupId>
            <artifactId>lucene-analyzers-smartcn</artifactId>
            <version>5.3.1</version>
        </dependency>

        <dependency>
            <groupId>commons-io</groupId>
            <artifactId>commons-io</artifactId>
            <version>2.4</version>
        </dependency>

        <dependency>
            <groupId>junit</groupId>
            <artifactId>junit</artifactId>
            <version>4.12</version>
        </dependency>

索引器：


import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.TextField;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.store.RAMDirectory;
import java.io.File;
import java.io.FileReader;
import java.nio.file.Paths;

/**
 * Builds a Lucene index on disk from the files found in a data directory.
 *
 * <p>Every regular file becomes one document with three fields:
 * {@code context} (the file content, supplied through a reader),
 * {@code fileName} and {@code filePath} (both stored in the index).
 *
 * <p>The class is {@link AutoCloseable} so it can be used in a
 * try-with-resources statement; {@link #close()} releases the writer, the
 * analyzer and the index directory.
 *
 * Created by Nemo on 2017/9/20.
 */
public class Indexer implements AutoCloseable {

    /** Directory holding the index files; closed in {@link #close()}. */
    private final Directory directory;

    /** Analyzer handed to the writer; closed in {@link #close()}. */
    private final Analyzer analyzer;

    /** Writer used to add documents to the index. */
    private final IndexWriter writer;

    /**
     * Opens (or creates) the index stored at {@code indexDir}.
     *
     * @param indexDir file-system location of the index
     * @throws Exception if the directory or the writer cannot be opened
     */
    public Indexer(String indexDir) throws Exception {
        // For an in-memory index, RAMDirectory could be used here instead.
        Directory dir = FSDirectory.open(Paths.get(indexDir));
        Analyzer standardAnalyzer = new StandardAnalyzer();
        IndexWriter indexWriter;
        try {
            indexWriter = new IndexWriter(dir, new IndexWriterConfig(standardAnalyzer));
        } catch (Exception e) {
            // Do not leak the already-opened resources when the writer cannot be created.
            try {
                standardAnalyzer.close();
                dir.close();
            } catch (Exception closeFailure) {
                e.addSuppressed(closeFailure);
            }
            throw e;
        }
        this.directory = dir;
        this.analyzer = standardAnalyzer;
        this.writer = indexWriter;
    }

    /**
     * Closes the index writer, then the analyzer and the index directory.
     * The later resources are closed even if closing the writer fails.
     *
     * @throws Exception if closing any of the resources fails
     */
    @Override
    public void close() throws Exception {
        try {
            writer.close();
        } finally {
            try {
                analyzer.close();
            } finally {
                directory.close();
            }
        }
    }

    /**
     * Indexes every regular file directly inside {@code dataDir}.
     * Subdirectories and other non-file entries are skipped.
     *
     * @param dataDir directory whose files should be indexed
     * @return the value of {@code writer.numDocs()} after indexing, i.e. the
     *         document count reported by the writer for the whole index, not
     *         only the documents added by this call
     * @throws IllegalArgumentException if {@code dataDir} is not a readable directory
     * @throws Exception if reading a file or writing to the index fails
     */
    public int index(String dataDir) throws Exception {
        File[] files = new File(dataDir).listFiles();
        // listFiles() returns null when the path is not a directory or cannot be read.
        if (files == null) {
            throw new IllegalArgumentException(
                    "Data directory does not exist or is not a readable directory: " + dataDir);
        }
        for (File file : files) {
            // A directory cannot be opened with FileReader, so only index regular files.
            if (file.isFile()) {
                indexFile(file);
            }
        }
        return writer.numDocs();
    }

    /**
     * Adds a single file to the index (comparable to inserting one table row).
     *
     * @param file file to index
     * @throws Exception if the file cannot be read or the document cannot be added
     */
    public void indexFile(File file) throws Exception {
        System.out.println("索引文件:" + file.getCanonicalPath());
        // The content reader is closed here regardless of whether indexing succeeds.
        try (FileReader content = new FileReader(file)) {
            writer.addDocument(buildDocument(file, content));
        }
    }

    /**
     * Creates the document for a file.
     *
     * <p>The returned document holds an open {@link FileReader} on the file
     * for its {@code context} field; this method does not close that reader.
     *
     * @param file file to describe
     * @return a document with the {@code context}, {@code fileName} and {@code filePath} fields
     * @throws Exception if the file cannot be opened or its canonical path cannot be resolved
     */
    public Document getDocument(File file) throws Exception {
        FileReader content = new FileReader(file);
        try {
            return buildDocument(file, content);
        } catch (Exception e) {
            // Building failed, so nobody else will ever see (or close) the reader.
            try {
                content.close();
            } catch (Exception closeFailure) {
                e.addSuppressed(closeFailure);
            }
            throw e;
        }
    }

    /** Assembles the three-field document for {@code file}, reading content from {@code content}. */
    private Document buildDocument(File file, FileReader content) throws Exception {
        Document document = new Document();
        // Each field is comparable to a column of a database table.
        document.add(new TextField("context", content));
        document.add(new TextField("fileName", file.getName(), Field.Store.YES));
        document.add(new TextField("filePath", file.getCanonicalPath(), Field.Store.YES));
        return document;
    }

    public static void main(String[] args) {
        // Where the index is stored.
        String indexDir = "F:\\LuceneIndex";
        // Directory containing the files to index.
        String dataDir = "F:\\LuceneTestData";
        // try-with-resources: close() is only invoked when construction succeeded.
        try (Indexer indexer = new Indexer(indexDir)) {
            int indexSum = indexer.index(dataDir);
            System.out.println("完成" + indexSum + "个文件的索引");
        } catch (Exception e) {
            e.printStackTrace();
        }
    }

}

索引查询：


import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.queryparser.classic.QueryParser;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
import java.nio.file.Paths;

/**
 * Searches an on-disk Lucene index by keyword.
 *
 * Created by Nemo on 2017/9/20.
 */
public class Searcher {

    /** Maximum number of hits fetched per search. */
    private static final int MAX_HITS = 10;

    /**
     * Runs {@code q} against the {@code context} field of the index stored at
     * {@code indexDir} and prints the total hit count followed by the
     * {@code fileName} of each of the top hits.
     *
     * <p>The directory, reader and analyzer are closed when the method
     * returns, including when parsing or searching throws.
     *
     * @param indexDir file-system location of the index
     * @param q        query string, parsed with the classic query parser
     * @throws Exception if the index cannot be opened, the query cannot be
     *                   parsed, or the search fails
     */
    public static void search(String indexDir, String q) throws Exception {
        try (Directory dir = FSDirectory.open(Paths.get(indexDir));
             IndexReader reader = DirectoryReader.open(dir);
             Analyzer analyzer = new StandardAnalyzer()) {
            IndexSearcher is = new IndexSearcher(reader);
            // Queries are parsed against the "context" field of the documents.
            QueryParser parser = new QueryParser("context", analyzer);
            Query query = parser.parse(q);
            TopDocs hits = is.search(query, MAX_HITS);
            System.out.println("匹配 "+q+"查询到"+hits.totalHits+"个记录");
            for (ScoreDoc scoreDoc : hits.scoreDocs) {
                Document doc = is.doc(scoreDoc.doc);
                // Print the stored fileName field of the matching document.
                System.out.println(doc.get("fileName"));
            }
        }
    }

    public static void main(String[] args) {
        // Location of the index.
        String indexDir = "F:\\LuceneIndex";
        // Keyword to search for.
        String q = "正则";
        try {
            search(indexDir, q);
        } catch (Exception e) {
            e.printStackTrace();
        }
    }
}

本文标签
索引搜索引擎搜索 lucene 全文检索

上一个文章：Lucene 创建全文检索基本原理记录

下一个文章：HTML操作工具 - 正则HTML中获取图片地址列表

点了个评

Nemo

最近回复

Lucene入门样例 - 第一个demo

点击排行

没有找到这位爷的热门文章哦~

最新文章

使用DrissionPage无头模式采集网页信息

Python Selenium获取浏览器中的网络请求响应

解决pip安装库时提示UnicodeDecodeError: 'gbk' codec can't decode byte 0xaa in position 72: illegal multibyte sequence

陶渊明诗集（收藏版）

Python print如何一行覆盖输出？

论性能过剩

单元测试编码规范

浅谈代码覆盖率

Java & Python 里的泛型

python Selenium 操作工具封装：反反爬虫+内存管理