OSDN Git Service

ns_search: lucene_wrapper supports BooleanQuery
[newslash/newslash.git] / src / ns_search / lucene_wrapper / wrapper.py
1 # -*- coding: utf-8 -*-
2 """wrapper: Wrapper interface to Full-text search system 'lucene'"""
3
4 import lucene
5 from java.io import File
6 from org.apache.lucene.util import Version
7 from org.apache.lucene.index import IndexWriter, IndexWriterConfig, IndexReader, Term
8 from org.apache.lucene.search import (IndexSearcher, BooleanClause, TopScoreDocCollector,
9                                       TotalHitCountCollector,
10                                       TermQuery as _TermQuery,
11                                       BooleanQuery as _BooleanQuery)
12 from org.apache.lucene.analysis.ja import JapaneseAnalyzer, JapaneseTokenizer
13 from org.apache.lucene.store import SimpleFSDirectory
14 from org.apache.lucene.document import (Field, TextField, StringField, IntField,
15                                         Document as LuceneDocument)
16 from org.apache.lucene.queryparser.flexible.standard import StandardQueryParser
17
def init_vm():
    """Start the Java VM through the ``lucene`` module.

    Delegates to ``lucene.initVM()``; whatever that call returns is
    discarded, so this function always returns ``None``.
    """
    lucene.initVM()
20
class LuceneWrapper(object):
    """Common base class for the wrapper objects in this module.

    Stores the keyword arguments given to the constructor as the
    configuration and pins the Lucene version constant to use.
    """

    def __init__(self, **kwargs):
        """Keep *kwargs* as configuration and record the Lucene version."""
        self._version = Version.LUCENE_4_10_1
        self._config = kwargs

    def _get_analyzer(self):
        """Return a new ``JapaneseAnalyzer``.

        The analyzer is built with ``JapaneseTokenizer.Mode.NORMAL`` and the
        default stop set / stop tags that ``JapaneseAnalyzer`` itself
        provides.
        """
        # NOTE(review): the leading None is presumably the "no user
        # dictionary" argument -- confirm against the Lucene Javadoc.
        return JapaneseAnalyzer(
            None,
            JapaneseTokenizer.Mode.NORMAL,
            JapaneseAnalyzer.getDefaultStopSet(),
            JapaneseAnalyzer.getDefaultStopTags()
        )

    def _get_index_directory(self):
        """Return a ``SimpleFSDirectory`` for the configured index path.

        Raises ``KeyError`` when the ``index_directory`` keyword argument was
        not passed to the constructor.
        """
        index_path = self._config["index_directory"]
        return SimpleFSDirectory(File(index_path))
35
class Indexer(LuceneWrapper):
    """Writes documents to the index through a Lucene ``IndexWriter``.

    Meant to be used as a context manager: entering the ``with`` block opens
    the writer, leaving it closes the writer again.
    """

    def __init__(self, **kwargs):
        """Store the configuration; the writer is opened lazily."""
        super(Indexer, self).__init__(**kwargs)
        self._writer = None

    def _get_writer(self):
        """Open an ``IndexWriter`` on the configured index directory.

        Returns ``self``.
        """
        analyzer = self._get_analyzer()

        # create IndexWriterConfig
        lucene_conf = IndexWriterConfig(self._version, analyzer)

        # index directory
        index_dir = self._get_index_directory()

        # create IndexWriter
        self._writer = IndexWriter(index_dir, lucene_conf)
        return self

    def _close_writer(self):
        """Close the writer if one is open and forget it.

        Returns ``self``.
        """
        # Compare against None explicitly instead of relying on the
        # truthiness of a wrapped Java object.
        if self._writer is not None:
            self._writer.close()
            self._writer = None
        return self

    def add(self, doc):
        """Add *doc* to the index and return ``self``.

        *doc* must expose the underlying Lucene document as ``doc.doc``.

        Raises ``LuceneWrapperError`` when the writer is not open, i.e. when
        called outside of a ``with`` block.
        """
        if self._writer is None:
            # Without this guard the call fails with an opaque
            # "'NoneType' object has no attribute 'addDocument'".
            raise LuceneWrapperError(
                "index writer is not open; call add() inside a 'with' block")
        self._writer.addDocument(doc.doc)
        return self

    def __enter__(self):
        """Open the writer and return ``self``."""
        self._get_writer()
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        """Close the writer; exceptions from the block are not suppressed."""
        self._close_writer()
70     
71
class Document(LuceneWrapper):
    """Builder around a Lucene document.

    The wrapped Lucene object is available as ``doc``.  Every ``add_*``
    method returns ``self`` so calls can be chained.
    """

    def __init__(self):
        """Create an empty Lucene document."""
        super(Document, self).__init__()
        self.doc = LuceneDocument()

    def _store_flag(self, store):
        """Map a truthy/falsy *store* to ``Field.Store.YES`` / ``NO``."""
        return Field.Store.YES if store else Field.Store.NO

    def add_string_field(self, name, value, store=True):
        """Add a ``StringField`` called *name* holding *value*."""
        stored = self._store_flag(store)
        self.doc.add(StringField(name, value, stored))
        return self

    def add_text_field(self, name, value, store=True):
        """Add a ``TextField`` called *name* holding *value*."""
        stored = self._store_flag(store)
        self.doc.add(TextField(name, value, stored))
        return self

    def add_int_field(self, name, value, store=True):
        """Add an ``IntField`` called *name* holding *value*."""
        stored = self._store_flag(store)
        self.doc.add(IntField(name, value, stored))
        return self
93
94
class Searcher(LuceneWrapper):
    """Runs queries against the configured index.

    The index reader is opened in the constructor.  Call ``close()`` (or use
    the instance as a context manager) to release it when done.
    """

    def __init__(self, **kwargs):
        """Store the configuration and open the index for searching."""
        super(Searcher, self).__init__(**kwargs)
        self._reader = None
        self._searcher = None
        self._get_searcher()

    def _get_searcher(self):
        """Open an ``IndexReader`` and wrap it in an ``IndexSearcher``.

        A reader opened by an earlier call is closed first so it is not
        leaked.  Returns ``self``.
        """
        self.close()
        index_dir = self._get_index_directory()
        # Keep a reference to the reader: it is the object that has to be
        # closed later on.
        self._reader = IndexReader.open(index_dir)
        self._searcher = IndexSearcher(self._reader)
        return self

    def close(self):
        """Close the index reader if it is open; safe to call repeatedly.

        After closing, the search methods can no longer be used.
        NOTE(review): result objects hold on to the searcher, so they should
        presumably be consumed before closing -- confirm with callers.

        Returns ``self``.
        """
        if self._reader is not None:
            self._reader.close()
            self._reader = None
            self._searcher = None
        return self

    def __enter__(self):
        """Return ``self``; the reader is already open."""
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        """Close the reader; exceptions from the block are not suppressed."""
        self.close()

    def search2(self, query, limit=1000, offset=0):
        """Search with paging and return a ``SearchResults``.

        *query* is a wrapper object exposing the Lucene query as
        ``query.query``.  The collector gathers ``limit + offset`` hits and
        the results start at *offset* with at most *limit* entries.
        """
        col = TopScoreDocCollector.create(limit + offset, True)
        self._searcher.search(query.query, col)
        docs = col.topDocs(offset, limit)
        return SearchResults(docs, self._searcher)

    def search(self, query, max_result=1000):
        """Search with a wrapped query; return up to *max_result* hits."""
        docs = self._searcher.search(query.query, max_result)
        return SearchResults(docs, self._searcher)

    def raw_search(self, query, max_result=1000):
        """Search with a raw Lucene query object (passed through as is)."""
        docs = self._searcher.search(query, max_result)
        return SearchResults(docs, self._searcher)
121
122
class SearchResults(LuceneWrapper):
    """Result set of a search: indexable and iterable.

    Items are handed out as ``ScoredDocument`` objects.  The instance is its
    own iterator, so iteration state lives on the instance.
    """

    def __init__(self, docs, searcher):
        """Keep the Lucene result object and the searcher that produced it."""
        self._iter = None
        self.searcher = searcher
        self.docs = docs
        self.total_hits = docs.totalHits

    def __getitem__(self, key):
        """Return the hit at position *key* as a ``ScoredDocument``."""
        hit = self.docs.scoreDocs[key]
        return ScoredDocument(hit, self.searcher)

    def __iter__(self):
        """Restart iteration from the first hit and return ``self``."""
        self._iter = iter(self.docs.scoreDocs)
        return self

    def next(self):
        """Python 2 spelling of ``__next__``."""
        return self.__next__()

    def __next__(self):
        """Return the next hit as a ``ScoredDocument``.

        Starts a new iteration when none is in progress; ``StopIteration``
        from the underlying iterator propagates when the hits run out.
        """
        if not self._iter:
            self.__iter__()

        hit = next(self._iter)
        return ScoredDocument(hit, self.searcher)
146         
147
class ScoredDocument(LuceneWrapper):
    """One search hit: the score entry plus the document it points to.

    Attributes set by the constructor:
      doc       -- the score entry passed in
      number    -- ``doc.doc``, the document number of the hit
      score     -- ``doc.score``
      searcher  -- the searcher used to load the document

    Any other attribute name is looked up as a field of the loaded document
    (see ``__getattr__``).
    """

    def __init__(self, doc, searcher):
        """Load the document referenced by *doc* through *searcher*."""
        super(ScoredDocument, self).__init__()
        self.doc = doc
        self._searcher_doc = searcher.doc(doc.doc)
        self.number = doc.doc
        self.score = doc.score
        self.searcher = searcher

    def __getattr__(self, name):
        """Return the value of document field *name*.

        Python only calls this when normal attribute lookup fails.  The
        result is whatever the loaded document's ``get(name)`` returns.

        Raises ``AttributeError`` for ``_searcher_doc`` itself.
        """
        # If _searcher_doc is missing (instance created without __init__,
        # e.g. by copy/pickle machinery), reading it below would re-enter
        # this method forever.  Fail cleanly instead.
        if name == "_searcher_doc":
            raise AttributeError(name)
        return self._searcher_doc.get(name)
160
161
class BooleanQuery(LuceneWrapper):
    """Wrapper around a Lucene boolean query.

    The underlying Lucene object is available as ``query``.
    """

    def __init__(self):
        """Create an empty boolean query."""
        super(BooleanQuery, self).__init__()
        self.query = _BooleanQuery()

    def add_must(self, query):
        """Add *query* as a ``MUST`` clause and return ``self``.

        *query* is a wrapper object exposing the Lucene query as
        ``query.query``.  Returning ``self`` matches the other mutating
        methods in this module and allows chaining.
        """
        self.query.add(query.query, BooleanClause.Occur.MUST)
        return self
169
170
class TermQuery(LuceneWrapper):
    """Wrapper around a Lucene term query.

    The underlying Lucene object is available as ``query``.
    """

    def __init__(self, field_name, query_term):
        """Build a term query for *query_term* in field *field_name*."""
        super(TermQuery, self).__init__()
        self.query = _TermQuery(Term(field_name, query_term))
177
178
class Query(LuceneWrapper):
    """Query parsed from text with Lucene's ``StandardQueryParser``.

    The parsed Lucene object is available as ``query``.
    """

    def __init__(self, field_name, query_text):
        """Parse *query_text* with *field_name* as the field argument.

        Leading wildcards are allowed and the analyzer from
        ``_get_analyzer()`` is used.

        Raises ``QueryParseError`` carrying the Java exception's message when
        the parser raises ``lucene.JavaError``.
        """
        super(Query, self).__init__()

        query_parser = StandardQueryParser()
        query_parser.setAllowLeadingWildcard(True)
        query_parser.setAnalyzer(self._get_analyzer())

        try:
            parsed = query_parser.parse(query_text, field_name)
        except lucene.JavaError as e:
            raise QueryParseError(e.getJavaException().getMessage())
        self.query = parsed
191
192
class LuceneWrapperError(Exception):
    """Base class for the errors raised by this wrapper module."""


class QueryParseError(LuceneWrapperError):
    """Raised when a query string cannot be parsed.

    The parser's message is available as ``message`` and through ``str()``.
    """

    def __init__(self, message):
        """Store *message* on the instance and in the exception ``args``."""
        # Forward the message to the base initializer so that args/str() are
        # populated even when the message is passed as a keyword argument.
        super(QueryParseError, self).__init__(message)
        self.message = message
199