OSDN Git Service

ns_search: add TermQuery class
[newslash/newslash.git] / src / ns_search / lucene_wrapper / wrapper.py
1 # -*- coding: utf-8 -*-
2 """wrapper: Wrapper interface to Full-text search system 'lucene'"""
3
4 import lucene
5 from java.io import File
6 from org.apache.lucene.util import Version
7 from org.apache.lucene.index import IndexWriter, IndexWriterConfig, IndexReader, Term
8 from org.apache.lucene.search import IndexSearcher, TermQuery as _TermQuery
9 from org.apache.lucene.analysis.ja import JapaneseAnalyzer, JapaneseTokenizer
10 from org.apache.lucene.store import SimpleFSDirectory
11 from org.apache.lucene.document import Document as LuceneDocument, Field, TextField, StringField, IntField
12 #from org.apache.lucene.queryparser.classic import QueryParser
13 from org.apache.lucene.queryparser.flexible.standard import StandardQueryParser
14
def init_vm():
    """Start the Java VM used by PyLucene by calling ``lucene.initVM()``.

    The value returned by ``lucene.initVM()`` is discarded; this function
    returns None.
    """
    lucene.initVM()
17
class LuceneWrapper(object):
    """Common base class of the Lucene wrapper objects.

    Keeps the keyword arguments passed at construction time as the
    configuration and provides helpers that build the analyzer and the
    index directory used by the subclasses.
    """

    def __init__(self, **kwargs):
        """Remember *kwargs* as the configuration and pin the Lucene version."""
        self._config = kwargs
        self._version = Version.LUCENE_4_10_1

    def _get_analyzer(self):
        """Build a JapaneseAnalyzer with NORMAL mode and the default stop sets."""
        return JapaneseAnalyzer(
            None,  # first constructor argument is left unset
            JapaneseTokenizer.Mode.NORMAL,
            JapaneseAnalyzer.getDefaultStopSet(),
            JapaneseAnalyzer.getDefaultStopTags(),
        )

    def _get_index_directory(self):
        """Return a SimpleFSDirectory for the configured ``index_directory``.

        Raises KeyError when ``index_directory`` was not passed to the
        constructor.
        """
        index_path = File(self._config["index_directory"])
        return SimpleFSDirectory(index_path)
32
class Indexer(LuceneWrapper):
    """Writes documents into the index.

    Intended to be used as a context manager: the IndexWriter is created
    on ``__enter__`` and closed on ``__exit__``.
    """

    def __init__(self, **kwargs):
        """Store the configuration; no writer exists until the context is entered."""
        super(Indexer, self).__init__(**kwargs)
        self._writer = None

    def _get_writer(self):
        """Create the IndexWriter on the configured directory and return self."""
        writer_config = IndexWriterConfig(self._version, self._get_analyzer())
        self._writer = IndexWriter(self._get_index_directory(), writer_config)
        return self

    def _close_writer(self):
        """Close the IndexWriter if one is open, then forget it; return self."""
        if not self._writer:
            return self
        self._writer.close()
        self._writer = None
        return self

    def add(self, doc):
        """Add *doc* (a wrapper exposing the Lucene document as ``.doc``).

        Returns self so that calls can be chained.
        """
        lucene_doc = doc.doc
        self._writer.addDocument(lucene_doc)
        return self

    def __enter__(self):
        """Open the writer and hand back this indexer."""
        self._get_writer()
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        """Close the writer; exceptions from the ``with`` body are not suppressed."""
        self._close_writer()
67     
68
class Document(LuceneWrapper):
    """Builder around a Lucene document; the wrapped object is ``self.doc``.

    Every ``add_*_field`` method returns self so that calls can be chained.
    """

    def __init__(self):
        """Create an empty Lucene document."""
        super(Document, self).__init__()
        self.doc = LuceneDocument()

    def _store_flag(self, store):
        """Map a truthy/falsy *store* to ``Field.Store.YES`` / ``Field.Store.NO``."""
        return Field.Store.YES if store else Field.Store.NO

    def _add_field(self, field_type, name, value, store):
        """Instantiate *field_type* and append it to the wrapped document."""
        self.doc.add(field_type(name, value, self._store_flag(store)))
        return self

    def add_string_field(self, name, value, store=True):
        """Add a ``StringField`` called *name* holding *value*."""
        return self._add_field(StringField, name, value, store)

    def add_text_field(self, name, value, store=True):
        """Add a ``TextField`` called *name* holding *value*."""
        return self._add_field(TextField, name, value, store)

    def add_int_field(self, name, value, store=True):
        """Add an ``IntField`` called *name* holding *value*."""
        return self._add_field(IntField, name, value, store)
90
91
class Searcher(LuceneWrapper):
    """Runs queries against the index in the configured directory.

    The index reader is opened in the constructor. Call ``close()`` (or use
    the instance as a context manager) to release it once searching is done.
    """

    def __init__(self, **kwargs):
        """Store the configuration and open the index for searching."""
        super(Searcher, self).__init__(**kwargs)
        self._reader = None
        self._searcher = None
        self._get_searcher()

    def _get_searcher(self):
        """Open an IndexReader on the index directory and wrap it in an IndexSearcher."""
        index_dir = self._get_index_directory()
        # Keep a reference to the reader so that close() can release it.
        self._reader = IndexReader.open(index_dir)
        self._searcher = IndexSearcher(self._reader)
        return self

    def close(self):
        """Close the underlying index reader; calling it again is a no-op.

        SearchResults created by this searcher load documents through the
        same IndexSearcher, so consume them before closing.
        """
        if self._reader is not None:
            self._reader.close()
            self._reader = None
            self._searcher = None
        return self

    def __enter__(self):
        """Return this searcher; the reader is already open."""
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        """Close the reader; exceptions from the ``with`` body are not suppressed."""
        self.close()

    def search(self, query, max_result=1000):
        """Search with a wrapper query object that exposes ``.query``.

        Returns a SearchResults holding at most *max_result* hits.
        """
        return self.raw_search(query.query, max_result)

    def raw_search(self, query, max_result=1000):
        """Search with a native Lucene query object.

        Returns a SearchResults holding at most *max_result* hits.
        """
        docs = self._searcher.search(query, max_result)
        return SearchResults(docs, self._searcher)
110
111
class SearchResults(LuceneWrapper):
    """Iterable view over the hits of one search.

    Iterating yields a ScoredDocument per entry of ``docs.scoreDocs``.
    The object is its own iterator, so starting a new ``for`` loop restarts
    from the first hit and nested loops over the same instance share state.
    """

    def __init__(self, docs, searcher):
        """Wrap *docs* (the object returned by the searcher's ``search``).

        *searcher* is kept so that each hit can be loaded on demand.
        """
        # Initialize the base class like every other wrapper does, so that
        # the inherited attributes (_config, _version) exist on this object.
        super(SearchResults, self).__init__()
        self.docs = docs
        self.searcher = searcher
        self.total_hits = docs.totalHits
        self._iter = None

    def __iter__(self):
        """Restart iteration from the first hit and return self."""
        self._iter = iter(self.docs.scoreDocs)
        return self

    def next(self):
        """Python 2 iterator protocol; same as ``__next__``."""
        return self.__next__()

    def __next__(self):
        """Return the next hit as a ScoredDocument; raise StopIteration at the end."""
        # Allow next(results) without a preceding iter(results).
        if self._iter is None:
            self.__iter__()

        doc = next(self._iter)
        return ScoredDocument(doc, self.searcher)
132         
133
class ScoredDocument(LuceneWrapper):
    """One search hit together with its loaded document.

    Attributes set by the constructor:
      doc      -- the hit object given by the caller (exposes ``doc`` and ``score``)
      number   -- ``doc.doc``, the value passed to ``searcher.doc()``
      score    -- ``doc.score``
      searcher -- the searcher used to load the document

    Any other attribute name is looked up as a field of the loaded document
    via its ``get(name)`` method. Fields called ``doc``, ``number``, ``score``
    or ``searcher`` are shadowed by the attributes above.
    """

    def __init__(self, doc, searcher):
        """Load the document referenced by *doc* through *searcher*."""
        super(ScoredDocument, self).__init__()
        self.doc = doc
        self._searcher_doc = searcher.doc(doc.doc)
        self.number = doc.doc
        self.score = doc.score
        self.searcher = searcher

    def __getattr__(self, name):
        """Return the value of the document field *name*.

        Only called when normal attribute lookup fails. Raises AttributeError
        for ``_searcher_doc`` itself and for dunder names:
          * if ``_searcher_doc`` is missing (instance created without
            ``__init__``, e.g. by copy or unpickling), reading it below would
            re-enter this method forever;
          * protocol probes such as ``hasattr(obj, '__setstate__')`` must see
            a missing attribute rather than a field lookup result.
        """
        if name == "_searcher_doc" or (name.startswith("__") and name.endswith("__")):
            raise AttributeError(name)
        return self._searcher_doc.get(name)
146
147
class TermQuery(LuceneWrapper):
    """Query for a single term in a single field.

    The native Lucene query is exposed as ``self.query``, the attribute
    that ``Searcher.search`` reads.
    """

    def __init__(self, field_name, query_term):
        """Build a Lucene TermQuery for *query_term* in *field_name*."""
        super(TermQuery, self).__init__()
        self.query = _TermQuery(Term(field_name, query_term))
154
155
class Query(LuceneWrapper):
    """Query parsed from text with Lucene's StandardQueryParser.

    The parsed native query is exposed as ``self.query``.
    """

    def __init__(self, field_name, query_text):
        """Parse *query_text* with *field_name* as the default field.

        Leading wildcards are allowed and the parser uses the analyzer
        returned by ``_get_analyzer()``.

        Raises QueryParseError carrying the Java exception's message when
        parsing raises ``lucene.JavaError``.
        """
        super(Query, self).__init__()

        query_parser = StandardQueryParser()
        query_parser.setAllowLeadingWildcard(True)
        query_parser.setAnalyzer(self._get_analyzer())

        try:
            parsed = query_parser.parse(query_text, field_name)
        except lucene.JavaError as java_error:
            raise QueryParseError(java_error.getJavaException().getMessage())
        self.query = parsed
168
169
class LuceneWrapperError(Exception):
    """Base class of the errors raised by this wrapper module."""
    pass


class QueryParseError(LuceneWrapperError):
    """Raised when a query string cannot be parsed.

    The parser's message is available as ``self.message`` and, because it is
    forwarded to the base Exception, also through ``str(error)`` and
    ``error.args``.
    """

    def __init__(self, message):
        # Forward the message so that str(error), error.args and tracebacks
        # show it instead of an empty string.
        super(QueryParseError, self).__init__(message)
        self.message = message
176