Apache Lucene Tutorial: Lucene for Text Search

Apache Lucene Tutorial: Lucene for Text Search

This article looks at how Apache Lucene can be used to perform text-based searching. Lucene provides high-performance text-based search capabilities and is a very easy-to-use library.

This article applies to Lucene 3.6 (latest release at the time of writing).

The sample code can be found here.

Project Structure:
org.fazlan.lucene.demo  
 |-- pom.xml  
 `-- src  
   `-- main  
     |-- java  
     |  `-- org  
     |    `-- fazlan  
     |      `-- lucene  
     |        `-- demo  
     |          |-- Indexer.java  
     |          |-- IndexItem.java  
     |          |-- Main.java  
     |          `-- Searcher.java  
     `-- resources  
       `-- index  
Step 1: Creating a Maven Project
mvn archetype:generate -DartifactId=org.fazlan.lucene.demo -DgroupId=org.fazlan -Dversion=1.0-SNAPSHOT -DinteractiveMode=false  
Step 2: Updated Maven Dependency (pom.xml)
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"  
  xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/maven-v4_0_0.xsd">  
  <!-- Build descriptor for the demo project; packaged as a plain jar. -->
  <modelVersion>4.0.0</modelVersion>  
  <groupId>org.fazlan</groupId>  
  <artifactId>org.fazlan.lucene.demo</artifactId>  
  <packaging>jar</packaging>  
  <version>1.0-SNAPSHOT</version>  
  <name>org.fazlan.lucene.demo</name>  
  <url>http://maven.apache.org</url>  
  <dependencies>  
   <!-- Lucene core library; 3.6.0 corresponds to the Version.LUCENE_36 constant used in the Java sources. -->
   <dependency>  
      <groupId>org.apache.lucene</groupId>  
      <artifactId>lucene-core</artifactId>  
      <version>3.6.0</version>  
   </dependency>  
   <!-- JUnit is only on the test classpath (scope "test"); it is not needed to run the demo. -->
   <dependency>  
    <groupId>junit</groupId>  
    <artifactId>junit</artifactId>  
    <version>3.8.1</version>  
    <scope>test</scope>  
   </dependency>  
  </dependencies>  
 </project>
Step 3: Defining the POJO Class used to Index Items

 package org.fazlan.lucene.demo;  
 /**
  * Value holder for one entry of the index: an identifier, a title and the
  * content text. The {@code ID}, {@code TITLE} and {@code CONTENT} constants
  * are the index field names under which the three values are stored.
  */
 public class IndexItem {

   /** Name of the index field holding the identifier. */
   public static final String ID = "id";

   /** Name of the index field holding the title. */
   public static final String TITLE = "title";

   /** Name of the index field holding the content. */
   public static final String CONTENT = "content";

   private Long id;
   private String title;
   private String content;

   /**
    * Creates an item from its three values.
    *
    * @param id      identifier of the item
    * @param title   title of the item
    * @param content content text of the item
    */
   public IndexItem(Long id, String title, String content) {
     this.id = id;
     this.title = title;
     this.content = content;
   }

   /** @return the identifier passed to the constructor */
   public Long getId() {
     return id;
   }

   /** @return the title passed to the constructor */
   public String getTitle() {
     return title;
   }

   /** @return the content passed to the constructor */
   public String getContent() {
     return content;
   }

   /**
    * Renders the item as {@code IndexItem{id=..., title='...', content='...'}}.
    */
   @Override
   public String toString() {
     StringBuilder text = new StringBuilder("IndexItem{");
     text.append("id=").append(id);
     text.append(", title='").append(title).append('\'');
     text.append(", content='").append(content).append('\'');
     text.append('}');
     return text.toString();
   }
 }
Step 4: Defining the Indexer
package org.fazlan.lucene.demo;  
 import org.apache.lucene.analysis.standard.StandardAnalyzer;  
 import org.apache.lucene.document.Document;  
 import org.apache.lucene.document.Field;  
 import org.apache.lucene.index.IndexWriter;  
 import org.apache.lucene.index.IndexWriterConfig;  
 import org.apache.lucene.index.Term;  
 import org.apache.lucene.store.FSDirectory;  
 import org.apache.lucene.util.Version;  
 import java.io.File;  
 import java.io.IOException;  
 /**
  * Writes {@link IndexItem}s into a Lucene index kept in a file-system directory.
  * Call {@link #close()} once all items have been indexed.
  */
 public class Indexer
 {
   private final IndexWriter writer;

   /**
    * Opens an index writer on the given directory, analysing text with a
    * {@link StandardAnalyzer}.
    *
    * @param indexDir path of the directory that holds the index
    * @throws IOException if the directory or the index cannot be opened
    */
   public Indexer(String indexDir) throws IOException {
     // The field is always unset when the constructor runs, so no null check is needed.
     writer = new IndexWriter(
         FSDirectory.open(new File(indexDir)),
         new IndexWriterConfig(Version.LUCENE_36, new StandardAnalyzer(Version.LUCENE_36)));
   }

   /**
    * Adds the item to the index, replacing any document previously indexed
    * under the same id.
    *
    * @param indexItem the item to index; its id must not be null
    * @throws IOException if writing to the index fails
    */
   public void index(IndexItem indexItem) throws IOException {
     String id = indexItem.getId().toString();

     Document doc = new Document();
     // The id is indexed as a single un-analyzed term so it can be matched exactly.
     doc.add(new Field(IndexItem.ID, id, Field.Store.YES, Field.Index.NOT_ANALYZED));
     doc.add(new Field(IndexItem.TITLE, indexItem.getTitle(), Field.Store.YES, Field.Index.ANALYZED));
     doc.add(new Field(IndexItem.CONTENT, indexItem.getContent(), Field.Store.YES, Field.Index.ANALYZED));

     // updateDocument deletes documents matching the term and adds the new one
     // atomically, instead of a separate delete followed by an add.
     writer.updateDocument(new Term(IndexItem.ID, id), doc);
   }

   /**
    * Closes the underlying index writer.
    *
    * @throws IOException if closing the writer fails
    */
   public void close() throws IOException {
     writer.close();
   }
 }
Field.Store.YES: Store the original field value in the index, so that it can be retrieved from the search results.
Field.Index.ANALYZED: Index the tokens produced by running the field's value through an Analyzer.

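Field.Index.NOT_ANALYZED: Index the field's value without using an Analyzer, so it can be searched. The whole value is indexed as a single term, which is useful for exact-match values such as the item ID.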

Step 5: Defining the Searcher
package org.fazlan.lucene.demo;  
 import org.apache.lucene.analysis.standard.StandardAnalyzer;  
 import org.apache.lucene.document.Document;  
 import org.apache.lucene.index.IndexReader;  
 import org.apache.lucene.queryParser.ParseException;  
 import org.apache.lucene.queryParser.QueryParser;  
 import org.apache.lucene.search.*;  
 import org.apache.lucene.store.FSDirectory;  
 import org.apache.lucene.util.Version;  
 import java.io.File;  
 import java.io.IOException;  
 import java.util.ArrayList;  
 import java.util.List;  

 /**
  * Runs title and content queries against the index written by the Indexer
  * and converts the matching documents back into {@link IndexItem}s.
  * Call {@link #close()} when the searcher is no longer needed.
  */
 public class Searcher {

   // Kept as a field because IndexSearcher.close() does not close a reader
   // that was handed to its constructor; close() below must do it.
   private final IndexReader reader;
   private final IndexSearcher searcher;
   private final QueryParser titleQueryParser;
   private final QueryParser contentQueryParser;

   /**
    * Opens the index stored in the given directory for searching.
    *
    * @param indexDir path of the directory that holds the index
    * @throws IOException if the index cannot be opened
    */
   public Searcher(String indexDir) throws IOException {
     // open the index directory to search
     reader = IndexReader.open(FSDirectory.open(new File(indexDir)));
     searcher = new IndexSearcher(reader);
     StandardAnalyzer analyzer = new StandardAnalyzer(Version.LUCENE_36);
     // query parser whose default field is the title
     titleQueryParser = new QueryParser(Version.LUCENE_36, IndexItem.TITLE, analyzer);
     // query parser whose default field is the content
     contentQueryParser = new QueryParser(Version.LUCENE_36, IndexItem.CONTENT, analyzer);
   }

   /**
    * Finds the indexed items whose title matches the query.
    *
    * @param queryString  the query string to search for
    * @param numOfResults the maximum number of items to return
    * @return the matching items, in the order returned by the search
    * @throws ParseException if the query string cannot be parsed
    * @throws IOException    if reading the index fails
    */
   public List<IndexItem> findByTitle(String queryString, int numOfResults) throws ParseException, IOException {
     return find(titleQueryParser, queryString, numOfResults);
   }

   /**
    * Finds the indexed items whose content matches the query.
    *
    * @param queryString  the query string to search for
    * @param numOfResults the maximum number of items to return
    * @return the matching items, in the order returned by the search
    * @throws ParseException if the query string cannot be parsed
    * @throws IOException    if reading the index fails
    */
   public List<IndexItem> findByContent(String queryString, int numOfResults) throws ParseException, IOException {
     return find(contentQueryParser, queryString, numOfResults);
   }

   /**
    * Closes the searcher and the index reader it was created on.
    *
    * @throws IOException if closing either resource fails
    */
   public void close() throws IOException {
     try {
       searcher.close();
     } finally {
       reader.close();
     }
   }

   /** Parses the query with the given parser, runs it and maps each hit to an IndexItem. */
   private List<IndexItem> find(QueryParser parser, String queryString, int numOfResults)
       throws ParseException, IOException {
     // create query from the incoming query string
     Query query = parser.parse(queryString);
     // execute the query and get the results
     ScoreDoc[] hits = searcher.search(query, numOfResults).scoreDocs;
     List<IndexItem> results = new ArrayList<IndexItem>(hits.length);
     for (ScoreDoc hit : hits) {
       Document doc = searcher.doc(hit.doc);
       results.add(new IndexItem(
           Long.parseLong(doc.get(IndexItem.ID)),
           doc.get(IndexItem.TITLE),
           doc.get(IndexItem.CONTENT)));
     }
     return results;
   }
 }
Step 6: The Application using the Indexer and the Searcher
package org.fazlan.lucene.demo; 
 
 import org.apache.lucene.queryParser.ParseException;  
 import java.io.BufferedReader;  
 import java.io.IOException;  
 import java.io.InputStreamReader;  
 import java.util.List;  

 /**
  * Console demo: indexes a fixed set of books, then repeatedly asks the user
  * for a title or content query and prints the matching items.
  */
 public class Main {
   // location where the index will be stored.
   private static final String INDEX_DIR = "src/main/resources/index";
   private static final int DEFAULT_RESULT_SIZE = 100;

   /**
    * Builds the index and runs the interactive search loop until the user
    * types Q/q or the input stream ends.
    *
    * @param args unused
    * @throws IOException    if the index or the console cannot be read or written
    * @throws ParseException declared for compatibility; malformed queries are
    *                        reported to the user and do not end the loop
    */
   public static void main(String[] args) throws IOException, ParseException {
     // the items to be indexed
     IndexItem[] indexItems = {
         new IndexItem(1L, "Java in Action", "This is Java in Action Book"),
         new IndexItem(2L, "Spring in Action", "This is Spring in Action Book"),
         new IndexItem(3L, "Hibernate in Action", "This is Hibernate in Action Book"),
         new IndexItem(4L, "SOA in Action", "This is SOA in Action Book"),
         new IndexItem(5L, "Apache Axis2 in Action", "This is Axis2 in Action Book"),
         new IndexItem(6L, "Apache CXF in Action", "This is CXF in Action Book"),
         new IndexItem(7L, "jQuery in Action", "This is jQuery in Action Book")};

     // creating the indexer and indexing the items
     Indexer indexer = new Indexer(INDEX_DIR);
     try {
       for (IndexItem indexItem : indexItems) {
         indexer.index(indexItem);
       }
     } finally {
       // close the indexer before the Searcher opens the same index location,
       // even if indexing failed part-way
       indexer.close();
     }

     BufferedReader reader = new BufferedReader(new InputStreamReader(System.in));
     System.out.println("Type Q/q to quit.");
     System.out.println("Type 1 query by title.");
     System.out.println("Type 2 query by content.");

     // creating the Searcher to the same index location as the Indexer
     Searcher searcher = new Searcher(INDEX_DIR);
     try {
       while (true) {
         System.out.print("Enter input: ");
         String input = reader.readLine();
         // readLine() returns null at end of input (e.g. Ctrl-D or a closed pipe)
         if (input == null || input.equalsIgnoreCase("q")) {
           break;
         }

         boolean byTitle = input.equals("1");
         if (!byTitle && !input.equals("2")) {
           // unknown command: ask again
           continue;
         }

         System.out.print(byTitle ? "Enter title to search: " : "Enter content to search: ");
         String query = reader.readLine();
         if (query == null) {
           break;
         }

         try {
           List<IndexItem> result = byTitle
               ? searcher.findByTitle(query, DEFAULT_RESULT_SIZE)
               : searcher.findByContent(query, DEFAULT_RESULT_SIZE);
           print(result);
         } catch (ParseException e) {
           // a malformed query is a user error, not a reason to end the session
           System.out.println("Invalid query: " + e.getMessage());
         }
       }
     } finally {
       searcher.close();
     }
   }

   /**
    * Prints the number of results followed by one line per item.
    */
   private static void print(List<IndexItem> result) {
     System.out.println("Result Size: " + result.size());
     for (IndexItem item : result) {
       System.out.println(item);
     }
   }
 }
Summary:
This article looked at how you can easily introduce text based indexing into your application using Apache Lucene.

The sample code can be found here.

0 comments:

Post a Comment