OSDN Git Service

ns_search: change search index structure - store metadata to file
[newslash/newslash.git] / src / ns_search / searchd.py
1 # -*- coding: utf-8
2 ''' searchd: Search daemon for Newslash'''
3
4 import re
5 import sys
6 import os
7 import os.path
8
9 from yaml import load
10 try:
11     from yaml import CLoader as Loader, CDumper as Dumper
12 except ImportError:
13     from yaml import Loader, Dumper
14
15 from newslash_searchd import Router, Route
16 import lucene_wrapper
17
# Runs once at import time, before any class below is defined or used.
# NOTE(review): presumably boots the VM that backs the Lucene bindings --
# confirm against lucene_wrapper.
lucene_wrapper.init_vm()
19
class SearchdError(Exception):
    """Error raised by searchd itself (e.g. the config file is missing).

    Attributes:
        message: human-readable description of the failure.
    """

    def __init__(self, message):
        # Chain to Exception so ``args`` and ``str(error)`` carry the message
        # on every interpreter version, not only where __new__ records args.
        super(SearchdError, self).__init__(message)
        self.message = message
23
24
# load config file
def _load_config(pathname):
    """Parse the YAML config file at *pathname* and return the result.

    When the ``Searchd`` section holds a relative ``index_path``, the value is
    rewritten in place to a normalized path resolved against the directory
    that contains the config file.

    Args:
        pathname: path of the config file to read.

    Returns:
        The object produced by the YAML loader (a mapping of sections).

    Raises:
        SearchdError: if the file cannot be opened.
    """
    try:
        fh = open(pathname)
    except IOError:
        raise SearchdError("config file not found")

    # ``with`` guarantees the handle is released even when parsing raises.
    # NOTE(review): ``Loader`` is the full YAML loader; that is only
    # acceptable while the config file is operator-controlled. Consider
    # SafeLoader if that ever changes.
    with fh:
        config = load(fh, Loader=Loader)

    # convert relative index_path to absolute path
    section = config.get('Searchd')
    if section:
        index_path = section.get('index_path') or ''
        # An empty/missing index_path is left untouched instead of being
        # indexed (``''[0]`` would raise IndexError).
        if index_path and not os.path.isabs(index_path):
            cfg_dir = os.path.dirname(pathname)
            section["index_path"] = os.path.normpath(
                os.path.join(cfg_dir, index_path))

    return config
45
# Locate the config file: $SEARCHD_CONFIG (or the system-wide default) wins;
# otherwise fall back to a ``searchd.conf`` sitting next to this script.
config_path = os.getenv("SEARCHD_CONFIG", "/etc/newslash/searchd.conf")
if not os.path.exists(config_path):
    base_dir = os.path.dirname(os.path.realpath(__file__))
    config_path = os.path.join(base_dir, 'searchd.conf')
    if not os.path.exists(config_path):
        raise SearchdError("config file not found")

# Parsed once at import time and shared by every route instance.
config = _load_config(config_path)
54
55
class Searchd(Router):
    """Router for the search daemon.

    Every request falls through to ``Root`` unless its path matches
    ``^/admin``, in which case ``SearchdAdmin`` handles it.
    """

    def __init__(self, environ, start_response):
        super(Searchd, self).__init__(environ, start_response)

        admin_pattern = re.compile(r'^/admin')
        fallback_handler = Root()
        self.default_route(fallback_handler)
        self.route(admin_pattern, SearchdAdmin())
61
62
class Root(Route):
    """Default route: runs search queries against the Lucene index."""

    def __init__(self):
        self._config = config

    def config(self, section, key=None, default=None):
        """Look up a configuration value.

        Args:
            section: name of the top-level config section.
            key: entry inside the section; when None the whole section is
                returned.
            default: value returned when the section or key is missing.
        """
        if key is None:
            return self._config.get(section, default)
        # ``or {}`` also covers a section that is present but empty (None).
        d = self._config.get(section) or {}
        return d.get(key, default)

    def get(self, req, resp):
        """Reply 200 with ``{"error": 0}``."""
        resp.render(200, json={"error": 0})

    def html_escape(self, text):
        """Return *text* with ``&``, ``<`` and ``>`` replaced by entities."""
        # "&" has to be handled first, otherwise the ampersands of the
        # entities produced below would be escaped a second time.
        text = text.replace("&", "&amp;")
        text = text.replace("<", "&lt;")
        text = text.replace(">", "&gt;")
        return text

    @staticmethod
    def _to_int(value, default):
        """Coerce *value* to int, returning *default* when that fails."""
        try:
            return int(value)
        except (TypeError, ValueError, OverflowError):
            return default

    def _build_query(self, query_text, target):
        """Build the boolean query for *query_text*.

        The text must match ``content_text`` or ``title`` (at least one);
        unless *target* is ``'all'`` the ``type`` field must equal *target*.
        """
        content_query = lucene_wrapper.Query("content_text", query_text)
        title_query = lucene_wrapper.Query("title", query_text)
        query = lucene_wrapper.BooleanQuery()
        # (sic) "nubmber" is the spelling this code has always called on the
        # wrapper; it is kept as-is because the wrapper is not visible here.
        query.set_minimum_nubmber_should_match(1)
        query.add_should(content_query)
        query.add_should(title_query)

        if target != 'all':
            query.add_must(lucene_wrapper.TermQuery("type", target))
        return query

    def _format_hit(self, highlighter, item):
        """Turn one search result into the dict returned to the client."""
        escaped_text = self.html_escape(item.content_text)
        fragments = highlighter.get_best_fragments("content_text", escaped_text, 2)
        # Fall back to the *escaped* text so the field is HTML-safe whether
        # or not the highlighter produced fragments.
        content_text = "".join(fragments) if fragments else escaped_text

        # NOTE(review): the title is highlighted without escaping; confirm
        # whether titles are stored pre-escaped before changing this.
        title = highlighter.get_best_fragment("title", item.title)
        if title is None or len(title) == 0:
            title = item.title

        return {
            "number": item.number,
            "type": item.type,
            "id": item.id,
            "title": title,
            "author": item.author,
            "create_time": item.create_time,
            "content_text": content_text,
        }

    def post(self, req, resp):
        """Run a search described by the request body and render the result.

        Body fields read: ``query``, ``limit`` (default 10), ``offset``
        (default 0), ``target`` (default ``'story'``), ``sort_key`` (default
        ``'create_time'``) and ``sort_reverse`` (default 1).

        Renders 400 when the body is missing or the query cannot be parsed,
        500 when building or running the query fails, and otherwise 200 with
        ``total_hits``, ``hits`` and ``start``.
        """
        # check request body is valid
        if req.body is None:
            resp.render(400)
            return

        # start query
        body = req.body
        query_text = body.get('query', '')
        limit = self._to_int(body.get('limit', 10), 10)
        target = body.get('target', 'story')
        sort_key = body.get('sort_key', 'create_time')
        sort_reverse = body.get('sort_reverse', 1) == 1
        offset = self._to_int(body.get('offset', 0), 0)

        index_dir = os.path.join(self.config('Searchd', 'index_path'), 'lucene_index')
        searcher = lucene_wrapper.Searcher(index_directory=index_dir)
        # try/finally closes the searcher on every exit path, including
        # failures while highlighting or iterating the results.
        try:
            try:
                query = self._build_query(query_text, target)
            except lucene_wrapper.QueryParseError as e:
                # Not every exception type exposes ``.message``.
                message = getattr(e, "message", str(e))
                resp.render(400, json={"error": {"message": message}})
                return
            except Exception as e:
                resp.render(500, json={"error": {"message": "query_error"}})
                req.environ['wsgi.errors'].write(
                    "query error: {} - query is {} \n".format(e, query_text))
                return

            try:
                sort = lucene_wrapper.Sort(sort_key, lucene_wrapper.Sort.INT, sort_reverse)
                result = searcher.search(query, limit, offset, sort)
            except Exception as e:
                resp.render(500, json={"error": {"message": "search_error"}})
                req.environ['wsgi.errors'].write(
                    "search error: {} - query is {} \n".format(e, query_text))
                return

            # for highlighting
            highlighter = lucene_wrapper.Highlighter(query, "<strong>", "</strong>")

            resp_body = {
                "total_hits": result.total_hits,
                "hits": [self._format_hit(highlighter, item) for item in result],
                "start": offset,
            }
        finally:
            searcher.close()

        # done
        resp.render(200, json=resp_body)
167     
168
class SearchdAdmin(Root):
    """Handler for admin requests; POST currently does nothing."""

    def get(self, req, resp):
        """Reply 200 with ``{"error": 0}``."""
        payload = {"error": 0}
        resp.render(200, json=payload)

    def post(self, req, resp):
        """Accept the request without rendering anything."""
        return None
175
176
if __name__ == '__main__':
    # Development entry point: serve the Searchd router with wsgiref.
    from wsgiref.simple_server import make_server, WSGIRequestHandler

    # ``or ""`` tolerates a ``host:`` key that is present but empty.
    host = config.get('host') or ""
    port = config.get('port', 6000)
    # ``host`` may be given as "name:port"; the embedded port wins.
    if ":" in host:
        (host, port) = host.split(":", 1)

    # The port split out of "name:port" is a string, but the server socket
    # needs an integer to bind.
    port = int(port)

    server = make_server(host, port, Searchd)

    print("starting server at {}:{}...".format(host, port))
    server.serve_forever()