How to index and search for files in Lucene

Question

How to index and search for files in Lucene

Navigation

#1 by (3 votes)
#2 by (2 votes)

5

I'm trying to build a file crawler in Java with the help of Lucene. I followed this guide from iMasters and tried to adapt it to version 4.7.0; the problem is that the search is not working. I checked the index files, and both the file information and the file contents are being indexed. Could you help me?

Here is my code:

Indexador.java

package main;

import java.io.*;
import java.text.*;

import org.apache.log4j.*;
import org.apache.lucene.analysis.*;
import org.apache.lucene.analysis.standard.*;
import org.apache.lucene.document.*;
import org.apache.lucene.document.Field;
import org.apache.lucene.index.*;
import org.apache.lucene.store.*;
import org.apache.lucene.util.*;
import org.apache.tika.*;

public class Indexador {
    private static Logger logger = Logger.getLogger(Indexador.class);
    // Diretório que irá guardar o índice;
    private String diretorioDosIndices = "C:\Users\strokes"
            + "\Documents\indice-lucene";
    // Diretório que contém os documentos que serão indexados;
    private String diretorioParaIndexar = "C:\Users\strokes"
            + "\Downloads";
    // IndexWriter: cria e mantém o índice;
    private IndexWriter writer;
    // Biblioteca que extrai texto de diversos formatos conhecidos;
    private Tika tika;

    public static void main(String[] args) {
        Indexador indexador = new Indexador();
        indexador.indexaArquivosDoDiretorio();
    }

    public void indexaArquivosDoDiretorio() {
        try {
            File diretorio = new File(diretorioDosIndices);
            apagaIndices(diretorio);
            // Directory: representa o diretório do índice;
            Directory d = new SimpleFSDirectory(diretorio);
            logger.info("Diretório do índice: " + diretorioDosIndices);
            // Analyser/StandardAnalyser: fazem o pré-processamento do texto.
            // Existem analisadores inclusive em português;
            Analyzer analyzer = new StandardAnalyzer(Version.LUCENE_47);
            // IndexWriterConfig: configurações para criação do índice. No
            // projeto serão utilizados os valores padrão;
            IndexWriterConfig config = new IndexWriterConfig(Version.LUCENE_47,
                    analyzer);
            // Inicializa o IndexWriter para gravação;
            writer = new IndexWriter(d, config);
            long inicio = System.currentTimeMillis();
            indexaArquivosDoDiretorio(new File(diretorioParaIndexar));
            // {12}
            writer.commit();
            writer.close();
            long fim = System.currentTimeMillis();
            logger.info("Tempo para indexar: " + ((fim - inicio) / 1000) + "s");
        } catch (IOException e) {
            logger.error(e);
        }
    }

    private void apagaIndices(File diretorio) {
        if (diretorio.exists()) {
            File arquivos[] = diretorio.listFiles();
            for (File arquivo : arquivos) {
                arquivo.delete();
            }
        }
    }

    public void indexaArquivosDoDiretorio(File raiz) {
        FilenameFilter filtro = new FilenameFilter() {
            public boolean accept(File arquivo, String nome) {
                if (nome.toLowerCase().endsWith(".pdf")
                        || nome.toLowerCase().endsWith(".odt")
                        || nome.toLowerCase().endsWith(".doc")
                        || nome.toLowerCase().endsWith(".docx")
                        || nome.toLowerCase().endsWith(".ppt")
                        || nome.toLowerCase().endsWith(".pptx")
                        || nome.toLowerCase().endsWith(".xls")
                        || nome.toLowerCase().endsWith(".txt")
                        || nome.toLowerCase().endsWith(".rtf")) {
                    return true;
                }
                return false;
            }
        };
        for (File arquivo : raiz.listFiles(filtro)) {
            if (arquivo.isFile()) {
                StringBuffer msg = new StringBuffer();
                msg.append("Indexando o arquivo ");
                msg.append(arquivo.getAbsoluteFile());
                msg.append(", ");
                msg.append(arquivo.length() / 1000);
                msg.append("kb");
                logger.info(msg);
                try {
                    // Extrai o conteúdo do arquivo com o Tika;
                    String textoExtraido = getTika().parseToString(arquivo);
                    indexaArquivo(arquivo, textoExtraido);
                } catch (Exception e) {
                    logger.error(e);
                }
            } else {
                indexaArquivosDoDiretorio(arquivo);
            }
        }
    }

    private void indexaArquivo(File arquivo, String textoExtraido) {
        SimpleDateFormat formatador = new SimpleDateFormat("yyyyMMdd");
        String ultimaModificacao = formatador.format(arquivo.lastModified());
        // Monta um Document para indexação
        // Field.Store.YES: armazena uma cópia do texto no índice, aumentando
        // muito o seu tamanho;
        // Field.Index.ANALYZED: utilizado quando o campo é de texto livre;
        // Field.Index.NOT_ANALYZED: utilizado quando o campo é um ID, data ou
        // númerico.
        Document documento = new Document();
        documento.add(new StringField("UltimaModificacao", ultimaModificacao, Field.Store.YES));
        documento.add(new StringField("Caminho", arquivo.getAbsolutePath(), Field.Store.YES));
        documento.add(new StringField("Texto", textoExtraido, Field.Store.YES));
        try {
            // Adiciona o Document no índice, mas este só estará disponível para
            // consulta após o commit.
            getWriter().addDocument(documento);
        } catch (IOException e) {
            logger.error(e);
        }
    }

    public Tika getTika() {
        if (tika == null) {
            tika = new Tika();
        }
        return tika;
    }

    public IndexWriter getWriter() {
        return writer;
    }
}

Buscador.java

package main;

import java.io.*;

import javax.swing.*;

import org.apache.log4j.*;
import org.apache.lucene.analysis.*;
import org.apache.lucene.analysis.standard.*;
import org.apache.lucene.document.*;
import org.apache.lucene.index.*;
import org.apache.lucene.queryparser.classic.QueryParser;
import org.apache.lucene.search.*;
import org.apache.lucene.store.*;
import org.apache.lucene.util.*;

public class Buscador {
    private static Logger logger = Logger.getLogger(Buscador.class);
    private String diretorioDoIndice = "C:\Users\strokes"
            + "\Documents\indice-lucene";

    public void buscaComParser(String parametro) {
        try {
            Directory diretorio = new SimpleFSDirectory(new File(
                    diretorioDoIndice));
            // IndexReader: classe abstrata responsável por acessar o índice;
            IndexReader leitor = DirectoryReader.open(diretorio);
            // IndexSearcher: implementa os métodos necessários para realizar
            // buscas em um índice;
            IndexSearcher buscador = new IndexSearcher(leitor);
            Analyzer analisador = new StandardAnalyzer(Version.LUCENE_47);
            // QueryParser/Query: representa a consulta do usuário. Outros
            // exemplos de query podem ser vistos no Javadoc;
            QueryParser parser = new QueryParser(Version.LUCENE_47, "Texto",
                    analisador);
            Query consulta = parser.parse(parametro);
            long inicio = System.currentTimeMillis();
            // Realiza a busca e armazena o resultado em um TopDocs;
            TopDocs resultado = buscador.search(consulta, 100);
            long fim = System.currentTimeMillis();
            int totalDeOcorrencias = resultado.totalHits;
            logger.info("Total de documentos encontrados:" + totalDeOcorrencias);
            logger.info("Tempo total para busca: " + (fim - inicio) + "ms");
            // ScoreDoc: representa cada um dos documentos retornados na busca.
            for (ScoreDoc sd : resultado.scoreDocs) {
                Document documento = buscador.doc(sd.doc);
                logger.info("Caminho:" + documento.get("Caminho"));
                logger.info("Última modificação:"
                        + documento.get("UltimaModificacao"));
                logger.info("Score:" + sd.score);
                logger.info("--------");
            }
        } catch (Exception e) {
            logger.error(e);
        }
    }

    public static void main(String[] args) {
        Buscador b = new Buscador();
        String parametro = JOptionPane.showInputDialog("Consulta");
        b.buscaComParser(parametro);
    }
}

java lucene

asked by anonymous 17.03.2014 / 19:59

2 answers

2

The Lucene 4 API is different from version 3. Below is a complete example of indexing and searching with Lucene 4.7.

package net.marcoreis.util;

import java.io.File;
import java.io.FilenameFilter;
import java.io.IOException;
import java.text.SimpleDateFormat;

import org.apache.log4j.Logger;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field.Store;
import org.apache.lucene.document.TextField;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.SimpleFSDirectory;
import org.apache.lucene.util.Version;
import org.apache.tika.Tika;

/**
 * Builds a Lucene 4.7 index from the documents found under a directory,
 * using Apache Tika to extract the text of each file.
 */
public class Indexador {
    private static final Logger logger = Logger.getLogger(Indexador.class);
    /** Lower-case file-name suffixes of the document types that are indexed. */
    private static final String[] EXTENSOES = { ".pdf", ".odt", ".doc",
            ".docx", ".ppt", ".pptx", ".xls", ".txt", ".rtf" };
    // Directory that stores the index.
    private String diretorioDosIndices = System.getProperty("user.home")
            + "/indice-lucene";
    // Directory that contains the documents to be indexed.
    private String diretorioParaIndexar = System.getProperty("user.home")
            + "/Dropbox/entrada";
    // Writer used to add documents to the index.
    private IndexWriter writer;
    // Extracts plain text from the supported file formats.
    private Tika tika;

    public static void main(String[] args) {
        Indexador indexador = new Indexador();
        indexador.indexaArquivosDoDiretorio();
    }

    /**
     * Deletes the files of any previous index, then indexes every supported
     * document under {@code diretorioParaIndexar} and logs the elapsed time.
     * I/O failures are logged rather than propagated.
     */
    public void indexaArquivosDoDiretorio() {
        try {
            File diretorio = new File(diretorioDosIndices);
            apagaIndices(diretorio);
            // Directory: represents the index directory.
            Directory d = new SimpleFSDirectory(diretorio);
            try {
                logger.info("Diretorio do indice: " + diretorioDosIndices);
                // The analyzer pre-processes the text before it is indexed.
                Analyzer analyzer = new StandardAnalyzer(Version.LUCENE_47);
                // Index creation settings.
                IndexWriterConfig config = new IndexWriterConfig(
                        Version.LUCENE_47, analyzer);
                writer = new IndexWriter(d, config);
                long inicio = System.currentTimeMillis();
                try {
                    indexaArquivosDoDiretorio(new File(diretorioParaIndexar));
                    writer.commit();
                } finally {
                    // Always close the writer, even when indexing fails.
                    writer.close();
                }
                long fim = System.currentTimeMillis();
                logger.info("Tempo para indexar: " + ((fim - inicio) / 1000) + "s");
            } finally {
                d.close();
            }
        } catch (IOException e) {
            // Pass the exception as the Throwable so the stack trace is logged.
            logger.error("Erro ao indexar o diretorio " + diretorioParaIndexar, e);
        }
    }

    /**
     * Deletes the entries directly inside the index directory, if it exists.
     *
     * @param diretorio the index directory
     */
    private void apagaIndices(File diretorio) {
        if (!diretorio.exists()) {
            return;
        }
        File[] arquivos = diretorio.listFiles();
        if (arquivos == null) {
            // listFiles() returns null when the path is not a directory or
            // cannot be read.
            logger.warn("Nao foi possivel listar " + diretorio.getAbsolutePath());
            return;
        }
        for (File arquivo : arquivos) {
            if (!arquivo.delete()) {
                logger.warn("Nao foi possivel apagar " + arquivo.getAbsolutePath());
            }
        }
    }

    /**
     * Recursively indexes every supported document under {@code raiz}.
     *
     * @param raiz directory to scan
     */
    public void indexaArquivosDoDiretorio(File raiz) {
        FilenameFilter filtro = new FilenameFilter() {
            public boolean accept(File diretorio, String nome) {
                // Let sub-directories through; otherwise the recursive branch
                // below is never reached.
                if (new File(diretorio, nome).isDirectory()) {
                    return true;
                }
                String nomeMinusculo = nome.toLowerCase();
                for (String extensao : EXTENSOES) {
                    if (nomeMinusculo.endsWith(extensao)) {
                        return true;
                    }
                }
                return false;
            }
        };
        File[] arquivos = raiz.listFiles(filtro);
        if (arquivos == null) {
            logger.warn("Nao foi possivel listar " + raiz.getAbsolutePath());
            return;
        }
        for (File arquivo : arquivos) {
            if (arquivo.isFile()) {
                logger.info("Indexando o arquivo " + arquivo.getAbsoluteFile()
                        + ", " + (arquivo.length() / 1000) + "kb");
                try {
                    // Extract the file content with Tika.
                    String textoExtraido = getTika().parseToString(arquivo);
                    indexaArquivo(arquivo, textoExtraido);
                } catch (Exception e) {
                    // One unreadable file must not abort the whole run.
                    logger.error("Erro ao indexar " + arquivo.getAbsolutePath(), e);
                }
            } else if (arquivo.isDirectory()) {
                indexaArquivosDoDiretorio(arquivo);
            }
        }
    }

    /**
     * Adds one document to the index with its last-modified date
     * (yyyyMMdd), absolute path and extracted text, all as stored TextFields.
     *
     * @param arquivo       the file being indexed
     * @param textoExtraido the text extracted from the file
     */
    private void indexaArquivo(File arquivo, String textoExtraido) {
        SimpleDateFormat formatador = new SimpleDateFormat("yyyyMMdd");
        String ultimaModificacao = formatador.format(arquivo.lastModified());
        // Store.YES keeps a copy of each value in the index so it can be
        // read back from the search results.
        Document documento = new Document();
        documento.add(new TextField("UltimaModificacao", ultimaModificacao,
                Store.YES));
        documento.add(new TextField("Caminho", arquivo.getAbsolutePath(),
                Store.YES));
        documento.add(new TextField("Texto", textoExtraido, Store.YES));
        try {
            // The document only becomes searchable after the commit.
            getWriter().addDocument(documento);
        } catch (IOException e) {
            logger.error("Erro ao adicionar " + arquivo.getAbsolutePath()
                    + " ao indice", e);
        }
    }

    /** Returns the shared Tika instance, creating it on first use. */
    public Tika getTika() {
        if (tika == null) {
            tika = new Tika();
        }
        return tika;
    }

    /** Returns the writer opened by {@link #indexaArquivosDoDiretorio()}. */
    public IndexWriter getWriter() {
        return writer;
    }
}

~~~~~~~~~~~~~~~~~~~~~

package net.marcoreis.util;

import java.io.File;

import javax.swing.JOptionPane;

import org.apache.log4j.Logger;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.queryparser.classic.QueryParser;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.SimpleFSDirectory;
import org.apache.lucene.util.Version;

/**
 * Runs a user-supplied query against the "Texto" field of the Lucene index
 * and logs the matching documents.
 */
public class Buscador {
    private static final Logger logger = Logger.getLogger(Buscador.class);
    // Directory that stores the index.
    private String diretorioDoIndice = System.getProperty("user.home")
            + "/indice-lucene";

    /**
     * Parses {@code parametro} with the classic query parser and logs up to
     * 100 hits (path, last-modified date and score). Failures are logged
     * rather than propagated.
     *
     * @param parametro the query text; a null or blank value is ignored
     */
    public void buscaComParser(String parametro) {
        if (parametro == null || parametro.trim().length() == 0) {
            // A cancelled input dialog yields null; there is nothing to search.
            logger.info("Consulta vazia; nenhuma busca realizada.");
            return;
        }
        try {
            Directory diretorio = new SimpleFSDirectory(new File(
                    diretorioDoIndice));
            try {
                // IndexReader gives read access to the index.
                IndexReader leitor = DirectoryReader.open(diretorio);
                try {
                    // IndexSearcher executes queries against the reader.
                    IndexSearcher buscador = new IndexSearcher(leitor);
                    Analyzer analisador = new StandardAnalyzer(Version.LUCENE_47);
                    // The parser turns the user's text into a Query on "Texto".
                    QueryParser parser = new QueryParser(Version.LUCENE_47,
                            "Texto", analisador);
                    Query consulta = parser.parse(parametro);
                    long inicio = System.currentTimeMillis();
                    // Run the search and keep the result in a TopDocs.
                    TopDocs resultado = buscador.search(consulta, 100);
                    long fim = System.currentTimeMillis();
                    int totalDeOcorrencias = resultado.totalHits;
                    logger.info("Total de documentos encontrados:" + totalDeOcorrencias);
                    logger.info("Tempo total para busca: " + (fim - inicio) + "ms");
                    // One ScoreDoc per document returned by the search.
                    for (ScoreDoc sd : resultado.scoreDocs) {
                        Document documento = buscador.doc(sd.doc);
                        logger.info("Caminho:" + documento.get("Caminho"));
                        logger.info("Ultima modificacao:"
                                + documento.get("UltimaModificacao"));
                        logger.info("Score:" + sd.score);
                        logger.info("--------");
                    }
                } finally {
                    // Close the reader even when parsing or searching fails.
                    leitor.close();
                }
            } finally {
                diretorio.close();
            }
        } catch (Exception e) {
            // Pass the exception as the Throwable so the stack trace is logged.
            logger.error("Erro ao executar a busca: " + parametro, e);
        }
    }

    public static void main(String[] args) {
        Buscador b = new Buscador();
        String parametro = JOptionPane.showInputDialog("Consulta");
        b.buscaComParser(parametro);
    }
}

22.03.2014 / 20:22

Problem setting CSS inside 'style' Disable keyboard in some fields

score 3 · Accepted Answer

The field "Texto" is a StringField, which means it is not analyzed: its entire content is indexed as a single token. This is effectively the same as using KeywordAnalyzer in Lucene 3.6, or setting the field to Field.Index.NOT_ANALYZED. In Lucene 4.x you should use TextField instead, which is the standard field type for textual content and is therefore analyzed.

Source: Can not index and find files [Java , Lucene, Tika, Log4j]

The part of Indexador.java that needs to change, from:

documento.add(new StringField("UltimaModificacao", ultimaModificacao, Field.Store.YES));
documento.add(new StringField("Caminho", arquivo.getAbsolutePath(), Field.Store.YES));
documento.add(new StringField("Texto", textoExtraido, Field.Store.YES));

To:

documento.add(new TextField("UltimaModificacao", ultimaModificacao, Field.Store.YES));
documento.add(new TextField("Caminho", arquivo.getAbsolutePath(), Field.Store.YES));
documento.add(new TextField("Texto", textoExtraido, Field.Store.YES));