OSDN Git Service

2cfe99a8bf826225367a20beecd7e41500abc7d3
[newslash/newslash.git] / src / ns_search / searchd_cli.py
1 #!/usr/bin/python
2 # -*- coding: utf-8
3 from __future__ import print_function
4
5 import argparse
6 import os.path
7 import os
8 import sys
9 import time
10 import inspect
11
12 from yaml import load
13 try:
14     from yaml import CLoader as Loader, CDumper as Dumper
15 except ImportError:
16     from yaml import Loader, Dumper
17
18 import lucene_wrapper
19 from newslash_index import Index, DatabaseError
20 from newslash_db import NewslashDB
21
class SearchCLIError(Exception):
    """Error raised by the search CLI, e.g. for configuration problems.

    The human-readable description is available as the ``message``
    attribute of the exception instance.
    """

    def __init__(self, message):
        """Remember *message* on the instance.

        The base-class initializer is not called here; only the
        ``message`` attribute is assigned.
        """
        self.message = message
25
class SearchCLI(object):
    """Command line front end for the Newslash search index.

    An instance parses the command line, locates and loads the searchd
    configuration file, and `run` then dispatches to the method that
    implements the requested subcommand (``query``, ``index``,
    ``getdocument``, ``initdb``, ``analyze`` or ``metadata``).
    """

    # Configuration file used when the SEARCHD_CONFIG environment variable
    # is not set.
    DEFAULT_CONFIG_PATH = "/etc/newslash/searchd.conf"

    # Maps the ``target`` argument of ``index update`` to the name of the
    # Index method performing that update.  Names are resolved lazily with
    # getattr() so only the requested method is looked up.
    _UPDATE_METHODS = {
        'all': 'update_all',
        'story': 'update_story',
        'comment': 'update_comment',
        'journal': 'update_journal',
        'submission': 'update_submission',
        'poll': 'update_poll',
        'user': 'update_user',
    }

    def __init__(self, argv=None):
        """Parse the command line and load the configuration file.

        The configuration path is taken from the ``SEARCHD_CONFIG``
        environment variable (default: ``/etc/newslash/searchd.conf``).
        When that path does not exist, ``searchd.conf`` next to this
        script is tried instead.

        Args:
            argv: Optional list of argument strings.  ``None`` (the
                default) makes argparse read ``sys.argv[1:]``.

        Raises:
            SearchCLIError: If no configuration file exists, or if no
                index directory is given by either the command line or
                the configuration file.
        """
        self.index_dir = None
        self._parse_args(argv)

        config_path = os.environ.get("SEARCHD_CONFIG", self.DEFAULT_CONFIG_PATH)
        if not os.path.exists(config_path):
            base_dir = os.path.dirname(os.path.realpath(__file__))
            config_path = os.path.join(base_dir, 'searchd.conf')
        if not os.path.exists(config_path):
            raise SearchCLIError("config file not found")

        self._load_config(config_path)

    def _make_parser(self):
        """Build and return the argparse parser for all subcommands.

        The name of the selected subcommand is stored by argparse itself
        in ``args.subcommand`` (``dest`` of the subparsers action).  This
        replaces a positional ``store_const`` argument on every
        subparser, which is not a documented use of argparse.
        """
        parser = argparse.ArgumentParser(description='search daemon for Newslash')
        parser.add_argument('-i', '--index-dir', help='lucene index directory')

        subparsers = parser.add_subparsers(dest='subcommand', help="subcommands")

        # 'query' subcommand
        p_query = subparsers.add_parser("query", help="execute query")
        p_query.add_argument('query_string', help='query string')

        # 'index' subcommand
        p_index = subparsers.add_parser("index", help="manipulate lucene index")
        p_index.add_argument('action', help='action')
        # nargs='?' is what makes the default effective; without it the
        # positional is mandatory and 'all' could never be applied.
        p_index.add_argument('target', nargs='?', default='all', help='target')

        # 'getdocument' subcommand
        p_getdoc = subparsers.add_parser("getdocument", help="get document from lucene index")
        p_getdoc.add_argument('target_type', help='target type')
        p_getdoc.add_argument('target_id', help='target id')

        # 'initdb' subcommand
        subparsers.add_parser("initdb", help="create table which stores index related information")

        # 'analyze' subcommand
        p_analyze = subparsers.add_parser("analyze", help="analyze index")
        p_analyze.add_argument('query_string', help='query string')
        p_analyze.add_argument('-n', '--number-of-result', default=10, type=int,
                               help='number of result output')

        # 'metadata' subcommand
        p_metadata = subparsers.add_parser("metadata", help="manipulate metadata")
        p_metadata.add_argument('action', help='action')

        return parser

    def _parse_args(self, argv=None):
        """Parse the command line into ``self.args``.

        Sets ``self.sub_command`` to the chosen subcommand name, or to an
        empty string when none was given, and sets ``self.index_dir``
        when ``-i/--index-dir`` was supplied.

        Args:
            argv: Optional list of argument strings; ``None`` means
                ``sys.argv[1:]``.
        """
        self.conf = {}
        self.args = self._make_parser().parse_args(argv)

        # index directory
        if self.args.index_dir:
            self.index_dir = os.path.abspath(self.args.index_dir)

        # command: the attribute is None when no subcommand was given
        self.sub_command = getattr(self.args, 'subcommand', None) or ""

    def _load_config(self, pathname):
        """Load the YAML configuration file at *pathname*.

        Populates ``self.config`` and ``self.database`` (the ``Database``
        section).  ``self.index_dir`` is taken from ``Searchd.index_path``
        unless it was already set from the command line.  If the file
        cannot be opened, a message is written to stderr and both
        ``self.config`` and ``self.database`` are left as empty dicts.

        Raises:
            SearchCLIError: If the index directory is given neither on the
                command line nor as ``Searchd.index_path``.
        """
        try:
            fh = open(pathname)
        except IOError:
            sys.stderr.write('config file ({}) not found...\n'.format(pathname))
            self.config = {}
            # Keep the attribute defined so subcommands do not fail with
            # AttributeError when they build an Index.
            self.database = {}
            return

        # NOTE(review): the full (non-safe) YAML loader is used here; that
        # is only acceptable while the configuration file is trusted.
        with fh:
            config = load(fh, Loader=Loader)

        # An empty YAML document is loaded as None.
        self.config = config or {}

        searchd = self.config.get("Searchd") or {}
        if self.index_dir is None:
            # Only fall back to the configuration when -i/--index-dir was
            # not given, so the command line option takes precedence.
            try:
                self.index_dir = os.path.abspath(searchd["index_path"])
            except KeyError:
                raise SearchCLIError("index_path not given")

        self.database = self.config.get("Database", {})

    def _open_index(self):
        """Return an Index built from the configured database and index path."""
        return Index(database=self.database, index_path=self.index_dir)

    def show_help(self):
        """Print the top-level usage message."""
        self._make_parser().print_help()

    def metadata(self):
        """Handle the ``metadata`` subcommand.

        ``show`` prints every entry returned by ``index.metadata.get_all()``;
        any other action is reported on stderr.
        """
        action = self.args.action
        if action != 'show':
            sys.stderr.write("error: unknown metadata action '{}'\n".format(action))
            return

        index = self._open_index()
        for data in index.metadata.get_all():
            print(data)

    def index(self):
        """Handle the ``index`` subcommand.

        ``clear`` deletes the whole index and its metadata.  ``update``
        runs the Index update method matching the target (``all``,
        ``story``, ``comment``, ``journal``, ``submission``, ``poll`` or
        ``user``) and prints the elapsed time.  Unknown actions or
        targets are reported on stderr and nothing else is done.
        """
        action = self.args.action
        target = self.args.target

        # Validate first so a typo is reported instead of silently doing
        # nothing (or claiming that indexing finished).
        if action not in ('clear', 'update'):
            sys.stderr.write("error: unknown index action '{}'\n".format(action))
            return
        if action == 'update' and target not in self._UPDATE_METHODS:
            sys.stderr.write("error: unknown index target '{}'\n".format(target))
            return

        lucene_wrapper.init_vm()
        index = self._open_index()

        def progress_cb(target, phase, success, errors):
            print("{}: {} to index {} items... ({} errors)".format(target, phase, success, errors))

        def error_cb(target, phase, item):
            # NOTE(review): assumes every failed item exposes a "sid" key;
            # confirm this holds for targets other than stories.
            print("{}: indexing {} error: id={}".format(target, phase, item["sid"]))

        if action == 'clear':
            index.delete_all()
            index.metadata.delete_all()
            print("clear all index and metadata done.")
            return

        start_time = time.time()
        update = getattr(index, self._UPDATE_METHODS[target])
        update(progress_cb=progress_cb, error_cb=error_cb)
        print("indexing done. total time: {}s".format(time.time() - start_time))

    def query(self):
        """Handle the ``query`` subcommand.

        Searches the ``content_text`` field for the given query string and
        prints the total hit count followed by one line per result.
        """
        if self.index_dir is None:
            sys.stderr.write("error: index directory not given\n")
            return

        query_string = self.args.query_string
        lucene_wrapper.init_vm()

        searcher = lucene_wrapper.Searcher(index_directory=self.index_dir)
        try:
            query = lucene_wrapper.Query("content_text", query_string)
        except lucene_wrapper.QueryParseError:
            sys.stderr.write("query parse error\n")
            return

        result = searcher.search(query)

        print("total hits: {}".format(result.total_hits))
        for item in result:
            print("#{} - {}: {}".format(item.number, item.id, item.content_text.encode('utf-8')))

    def analyze(self):
        """Handle the ``analyze`` subcommand.

        Searches both ``content_text`` and ``title`` for the query string,
        sorted on ``create_time``, and prints every field of each hit.
        For ``content_text`` the field's tokens are printed as well.
        """
        if self.index_dir is None:
            sys.stderr.write("error: index directory not given\n")
            return

        query_string = self.args.query_string
        limit = self.args.number_of_result

        lucene_wrapper.init_vm()

        searcher = lucene_wrapper.Searcher(index_directory=self.index_dir)
        try:
            content_query = lucene_wrapper.Query("content_text", query_string)
            title_query = lucene_wrapper.Query("title", query_string)
            query = lucene_wrapper.BooleanQuery()
            # NOTE(review): "nubmber" looks like a typo, but the name must
            # match whatever lucene_wrapper defines - confirm there before
            # renaming.
            query.set_minimum_nubmber_should_match(1)
            query.add_should(content_query)
            query.add_should(title_query)
        except lucene_wrapper.QueryParseError:
            sys.stderr.write("query parse error\n")
            return

        # NOTE(review): the trailing True is presumably "reverse order" and
        # the 0 passed to search() presumably an offset - confirm against
        # lucene_wrapper.
        sort = lucene_wrapper.Sort("create_time", lucene_wrapper.Sort.INT, True)
        result = searcher.search(query, limit, 0, sort)

        print("total hits: {}".format(result.total_hits))
        for item in result:
            print("#{} - {}:".format(item.number, item.id))

            for field in item.get_fields():
                print("  {}: {}".format(field.name, field.value.encode("utf8")))
                if field.name == "content_text":
                    # Additionally dump the tokens of the body text.
                    print("----")
                    for term in field.get_tokens():
                        sys.stdout.write("{} ".format(term.encode("utf8")))
                    print("----")

            print("\n")

    def getdocument(self):
        """Handle the ``getdocument`` subcommand.

        Looks up one document by target type and id and prints it, or
        prints ``no item`` when the lookup returns ``None``.
        """
        lucene_wrapper.init_vm()
        index = self._open_index()
        result = index.get(self.args.target_type, self.args.target_id)
        if result is None:
            print("no item")
            return

        # NOTE(review): search results elsewhere in this class expose
        # ``total_hits``; confirm that ``totalhits`` is the right attribute
        # on what Index.get() returns.
        if result.totalhits > 1:
            print("warning: hits multiple items")

        print("#{} - {}: {}".format(result.number, result.id, result.content_text.encode('utf-8')))

    def initdb(self):
        """Handle the ``initdb`` subcommand: create the metadata table.

        A DatabaseError is reported on stdout instead of being propagated.
        """
        index = self._open_index()
        try:
            index.metadata.create_table()
        except DatabaseError as e:
            print('error: {}'.format(str(e)))

    def run(self):
        """Run the selected subcommand; print the help text if there is none."""
        handlers = {
            'query': self.query,
            'index': self.index,
            'getdocument': self.getdocument,
            'initdb': self.initdb,
            'analyze': self.analyze,
            'metadata': self.metadata,
        }
        handler = handlers.get(self.sub_command, self.show_help)
        return handler()
266
267
if __name__ == '__main__':
    # Script entry point: build the CLI from sys.argv and run the chosen
    # subcommand.  A SearchCLIError (e.g. missing configuration file or
    # index path) is reported as a one-line message on stderr with exit
    # status 1 instead of an uncaught traceback.
    try:
        cli = SearchCLI()
        cli.run()
    except SearchCLIError as e:
        sys.stderr.write("error: {}\n".format(e.message))
        sys.exit(1)