1 # -*- coding: utf-8 -*-
2 """wrapper: Wrapper interface to Full-text search system 'lucene'"""
5 from java.io import File
6 from org.apache.lucene.util import Version
7 from org.apache.lucene.index import IndexWriter, IndexWriterConfig, IndexReader, Term
8 from org.apache.lucene.search import (IndexSearcher, BooleanClause, TopScoreDocCollector,
9 TotalHitCountCollector,
10 TermQuery as _TermQuery,
11 BooleanQuery as _BooleanQuery)
12 from org.apache.lucene.analysis.ja import JapaneseAnalyzer, JapaneseTokenizer
13 from org.apache.lucene.store import SimpleFSDirectory
14 from org.apache.lucene.document import (Field, TextField, StringField, IntField,
15 Document as LuceneDocument)
16 from org.apache.lucene.queryparser.flexible.standard import StandardQueryParser
class LuceneWrapper(object):
    """Base class shared by the Lucene wrapper objects in this module.

    It keeps the keyword configuration given at construction time and the
    Lucene compatibility version, and provides helpers that build the
    Japanese analyzer and open the on-disk index directory.
    """

    def __init__(self, **kwargs):
        """Remember the configuration and the Lucene version.

        :param kwargs: free-form configuration. ``index_directory`` is the
            only key read in this class (see ``_get_index_directory``).
        """
        # _get_index_directory() looks the path up in self._config, so the
        # keyword arguments must be kept on the instance.
        self._config = kwargs
        self._version = Version.LUCENE_4_10_1

    def _get_analyzer(self):
        """Return a ``JapaneseAnalyzer`` using Lucene's default stop lists."""
        mode = JapaneseTokenizer.Mode.NORMAL
        stop_set = JapaneseAnalyzer.getDefaultStopSet()
        stop_tags = JapaneseAnalyzer.getDefaultStopTags()
        # None: no user dictionary is supplied.
        return JapaneseAnalyzer(None, mode, stop_set, stop_tags)

    def _get_index_directory(self):
        """Open the directory named by the ``index_directory`` config key.

        :raises KeyError: if ``index_directory`` was not configured.
        """
        return SimpleFSDirectory(File(self._config["index_directory"]))
# Indexer: LuceneWrapper subclass that works with a Lucene IndexWriter kept
# in self._writer.
# NOTE(review): this listing has gaps -- several lines of the class are not
# visible here, so the comments below describe only what can be seen.
36 class Indexer(LuceneWrapper):
# Forward every keyword argument to LuceneWrapper.__init__.
37 def __init__(self, **kwargs):
38 super(Indexer, self).__init__(**kwargs)
# Build an IndexWriter from the analyzer, the Lucene version and the index
# directory, and keep it in self._writer (nothing is returned in the
# visible lines).
41 def _get_writer(self):
42 analyzer = self._get_analyzer()
44 # create IndexWriterConfig
45 lucene_conf = IndexWriterConfig(self._version, analyzer)
# Directory object comes from the inherited _get_index_directory() helper.
48 index_dir = self._get_index_directory()
51 self._writer = IndexWriter(index_dir, lucene_conf)
# NOTE(review): the body of _close_writer is not visible in this excerpt;
# presumably it closes self._writer -- confirm against the full file.
54 def _close_writer(self):
# Adds the wrapped Lucene document (doc.doc) to the writer.
# NOTE(review): the `def` line this statement belongs to is not visible.
61 self._writer.addDocument(doc.doc)
# Context-manager exit hook; its body (and any matching __enter__) is not
# visible in this excerpt.
68 def __exit__(self, exc_type, exc_value, traceback):
class Document(LuceneWrapper):
    """Builder around a Lucene ``Document``.

    The underlying Lucene object is exposed as ``self.doc``; the ``add_*``
    methods append one field each to it.
    """

    def __init__(self):
        """Create an empty Lucene document."""
        super(Document, self).__init__()
        self.doc = LuceneDocument()

    def _store_flag(self, store):
        """Translate a Python truth value into a ``Field.Store`` constant.

        :param store: truthy to store the field value in the index.
        :returns: ``Field.Store.YES`` if *store* is truthy, otherwise
            ``Field.Store.NO``.
        """
        if store:
            return Field.Store.YES
        return Field.Store.NO

    def add_string_field(self, name, value, store=True):
        """Append a ``StringField`` called *name* holding *value*.

        :param store: whether the value is stored (see ``_store_flag``).
        """
        self.doc.add(StringField(name, value, self._store_flag(store)))

    def add_text_field(self, name, value, store=True):
        """Append a ``TextField`` called *name* holding *value*.

        :param store: whether the value is stored (see ``_store_flag``).
        """
        self.doc.add(TextField(name, value, self._store_flag(store)))

    def add_int_field(self, name, value, store=True):
        """Append an ``IntField`` called *name* holding *value*.

        :param store: whether the value is stored (see ``_store_flag``).
        """
        self.doc.add(IntField(name, value, self._store_flag(store)))
class Searcher(LuceneWrapper):
    """Read-side wrapper that runs queries against the configured index."""

    def __init__(self, **kwargs):
        """Forward the configuration to ``LuceneWrapper.__init__``.

        :param kwargs: configuration; ``index_directory`` is needed as soon
            as the index is opened.
        """
        super(Searcher, self).__init__(**kwargs)
        # The index is opened on first use, see _ensure_searcher().
        self._searcher = None

    def _get_searcher(self):
        """Open the index directory and store an ``IndexSearcher`` on it."""
        index_dir = self._get_index_directory()
        reader = IndexReader.open(index_dir)
        self._searcher = IndexSearcher(reader)

    def _ensure_searcher(self):
        """Return ``self._searcher``, opening the index first if necessary."""
        # getattr() keeps this safe for instances created without __init__.
        if getattr(self, "_searcher", None) is None:
            self._get_searcher()
        return self._searcher

    def search2(self, query, limit=1000, offset=0):
        """Run *query* and return one page of its results.

        :param query: wrapper object exposing the Lucene query as ``.query``.
        :param limit: maximum number of hits in the returned page.
        :param offset: number of leading hits to skip.
        :returns: a ``SearchResults`` for the requested page.
        """
        searcher = self._ensure_searcher()
        # Collect enough hits to cover the skipped ones plus the page itself.
        col = TopScoreDocCollector.create(limit + offset, True)
        searcher.search(query.query, col)
        docs = col.topDocs(offset, limit)
        return SearchResults(docs, searcher)

    def search(self, query, max_result=1000):
        """Run *query* and return up to *max_result* hits.

        :param query: wrapper object exposing the Lucene query as ``.query``.
        :returns: a ``SearchResults`` instance.
        """
        searcher = self._ensure_searcher()
        docs = searcher.search(query.query, max_result)
        return SearchResults(docs, searcher)

    def raw_search(self, query, max_result=1000):
        """Like ``search`` but *query* is passed to Lucene unchanged.

        :param query: object handed directly to ``IndexSearcher.search``.
        :returns: a ``SearchResults`` instance.
        """
        searcher = self._ensure_searcher()
        docs = searcher.search(query, max_result)
        return SearchResults(docs, searcher)
class SearchResults(LuceneWrapper):
    """Indexable, iterable view over the hits of one search.

    Every item is handed out as a ``ScoredDocument``.
    """

    def __init__(self, docs, searcher):
        """Wrap a Lucene result object.

        :param docs: result object exposing ``totalHits`` and ``scoreDocs``.
        :param searcher: searcher used to load the hit documents.
        """
        super(SearchResults, self).__init__()
        # __getitem__ and __iter__ read self.docs, so it must be kept.
        self.docs = docs
        self.searcher = searcher
        self.total_hits = docs.totalHits

    def __getitem__(self, key):
        """Return the hit at position *key* as a ``ScoredDocument``."""
        return ScoredDocument(self.docs.scoreDocs[key], self.searcher)

    def __iter__(self):
        """Restart iteration over the hits and return ``self``."""
        self._iter = iter(self.docs.scoreDocs)
        return self

    def next(self):
        """Python 2 spelling of ``__next__``."""
        return self.__next__()

    def __next__(self):
        """Return the next hit as a ``ScoredDocument``.

        :raises StopIteration: when the hits are exhausted.
        """
        doc = next(self._iter)
        return ScoredDocument(doc, self.searcher)
class ScoredDocument(LuceneWrapper):
    """One search hit: the loaded document plus its number and score.

    Fields of the loaded document are readable as attributes through
    ``__getattr__``.
    """

    def __init__(self, doc, searcher):
        """Load the document referred to by a hit.

        :param doc: hit object exposing ``doc`` (document number) and
            ``score``.
        :param searcher: searcher whose ``doc()`` method loads the document.
        """
        super(ScoredDocument, self).__init__()
        self._searcher_doc = searcher.doc(doc.doc)
        self.number = doc.doc
        self.score = doc.score
        self.searcher = searcher

    def __getattr__(self, name):
        """Return the value of the document field called *name*.

        Only invoked when normal attribute lookup fails. The value is
        whatever the loaded document's ``get(name)`` returns.

        :raises AttributeError: if the loaded document itself is missing.
        """
        # If _searcher_doc was never set (partially built instance), looking
        # it up below would re-enter __getattr__ and recurse without end.
        if name == "_searcher_doc":
            raise AttributeError(name)
        return self._searcher_doc.get(name)
class BooleanQuery(LuceneWrapper):
    """Wrapper around Lucene's ``BooleanQuery``, exposed as ``self.query``."""

    def __init__(self):
        """Create an empty boolean query."""
        super(BooleanQuery, self).__init__()
        self.query = _BooleanQuery()

    def add_must(self, query):
        """Add *query* as a clause with ``BooleanClause.Occur.MUST``.

        :param query: wrapper object exposing the Lucene query as ``.query``.
        """
        self.query.add(query.query, BooleanClause.Occur.MUST)
class TermQuery(LuceneWrapper):
    """Wrapper around Lucene's ``TermQuery``, exposed as ``self.query``."""

    def __init__(self, field_name, query_term):
        """Build a query for *query_term* in the field *field_name*."""
        super(TermQuery, self).__init__()
        self.query = _TermQuery(Term(field_name, query_term))
class Query(LuceneWrapper):
    """Query parsed from text with Lucene's ``StandardQueryParser``.

    The parsed Lucene query is exposed as ``self.query``.
    """

    def __init__(self, field_name, query_text):
        """Parse *query_text* with *field_name* as the default field.

        :param field_name: second argument passed to the parser's ``parse``.
        :param query_text: query string to parse.
        :raises QueryParseError: if the parser raises a Java exception; the
            Java exception's message is carried over.
        """
        super(Query, self).__init__()

        parser = StandardQueryParser()
        parser.setAllowLeadingWildcard(True)
        parser.setAnalyzer(self._get_analyzer())

        # NOTE(review): relies on the PyLucene `lucene` module being imported
        # at module level -- that import is not visible in this excerpt.
        try:
            self.query = parser.parse(query_text, field_name)
        except lucene.JavaError as e:
            raise QueryParseError(e.getJavaException().getMessage())
class LuceneWrapperError(Exception):
    """Base class for the errors raised by this wrapper module."""


class QueryParseError(LuceneWrapperError):
    """Raised when a query string cannot be parsed."""

    def __init__(self, message):
        """Keep *message* both as ``args[0]`` and as ``self.message``.

        :param message: human-readable description of the parse failure.
        """
        # Chain to Exception.__init__ so str(error) and error.args carry the
        # message on Python 2 as well as Python 3.
        super(QueryParseError, self).__init__(message)
        self.message = message