raise ConfigFileError(ConfigFileError.SECTION_NOT_FOUND, "Database")
return db_cfg
+    def delete(self, target, unique_id, indexer=None):
+        '''delete document from index'''
+        # create query to identify target document
+        q = lucene_wrapper.BooleanQuery()
+        q.add_must(lucene_wrapper.TermQuery("type", target))
+        q.add_must(lucene_wrapper.IntRangeQuery("unique_id", unique_id))
+
+        # delete document
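+        # reuse the caller's open indexer if given; otherwise open one just for this call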
+        if indexer:
+            indexer.delete(q)
+        else:
+            with lucene_wrapper.Indexer(index_directory=self.config("SearchIndex", "path")) as indexer:
+                indexer.delete(q)
+
    def delete_all(self):
-        '''delete all document from index'''
+        '''delete all documents from index'''
        with lucene_wrapper.Indexer(index_directory=self.config("SearchIndex", "path")) as indexer:
            indexer.delete_all()
-    def update(self, target, batch_size=1000, progress_cb=None, error_cb=None):
-        '''update story index'''
-        stories = newslash_db.Stories(self._db_config())
+    def _update(self, selector, target, batch_size=1000, progress_cb=None, error_cb=None):
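+        '''update index for given target; selector(limit, offset, latest_id, last_update=None) fetches rows'''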
        query_done = False
        # at first, get last indexed id and timestamp
-        stories_data = self.metadata.get('stories')
-        latest_id = stories_data.get('latest_id', 0)
-        last_update = stories_data.get('last_update')
+        metadata = self.metadata.get(target)
+        latest_id = metadata.get('latest_id', 0)
+        last_update = metadata.get('last_update')
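+        # latest_id and last_update describe how far the previous run got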
-        # add new stories to index
-        start_update = datetime.now()
+        # add new documents to index
+        # don't use datetime.now(): the database server's clock may differ from this server's
+        start_update = self.metadata.get_current_timestamp()
        add_success = 0
        add_errors = 0
        offset = 0
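+        # highest unique_id indexed in this run; stored as the next run's watermark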
-        max_stoid = 0
+        max_unique_id = 0
        with lucene_wrapper.Indexer(index_directory=self.config("SearchIndex", "path")) as indexer:
            while not query_done:
                # repeat process
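+                # each pass fetches at most batch_size rows newer than the watermark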
-                items = stories.select(limit=batch_size, offset=offset, stoid_gt=latest_id)
+                items = selector(batch_size, offset, latest_id)
                offset += len(items)
                if len(items) < batch_size:
                    query_done = True
                for item in items:
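+                    # stories flagged neverdisplay are counted as processed but not indexed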
-                    if item["neverdisplay"] == "1":
+                    if target == "story" and item["neverdisplay"] == "1":
                        add_success += 1
                        continue
                    try:
-                        doc = self._make_story_document(item)
+                        doc = self._make_document(target, item)
                    except exceptions.DocumentMakingError:
                        add_errors += 1
                        if error_cb is not None:
                            error_cb('add', item)
                        continue

                    indexer.add(doc)
                    add_success += 1

                if progress_cb is not None:
                    progress_cb('add', add_success, add_errors)

                for item in items:
-                    if item["stoid"] > max_stoid:
-                        max_stoid = item["stoid"];
+                    unique_id = self._get_unique_id(target, item)
+                    if unique_id > max_unique_id:
+                        max_unique_id = unique_id
+
+        # update metadata: record the new watermark for the next run
+        self.metadata.update(target=target, last_update=start_update, latest_id=max_unique_id)
+
+        # if there was no previous update, the initial indexing run is complete
+        if last_update is None:
+            return (add_success, add_errors)
-        # update index for updated stories
+        # update index for updated documents
        update_success = 0
        update_errors = 0
+        offset = 0
+        query_done = False
        with lucene_wrapper.Indexer(index_directory=self.config("SearchIndex", "path")) as indexer:
-            items = stories.select(stoid_le=latest_id, last_update_ge=last_update)
-            for item in items:
-                # first, create term to identify target document
-                target_id = self._get_primary_id(target, item)
-                term = lucene_wrapper.BooleanQuery()
-                term.add_must(lucene_wrapper.TermQuery("type", target))
-                term.add_must(lucene_wrapper.TermQuery("id", target_id))
-                try:
-                    doc = self._make_story_document(item)
-                except exceptions.DocumentMakingError:
-                    update_errors += 1
-                    if error_cb is not None:
-                        error_cb('update', item)
-                    continue
-                indexer.delete(term)
-                if item["neverdisplay"] != "1" and item["time"] <= datetime.now():
-                    indexer.add(doc)
-                update_success += 1
-                if progress_cb is not None:
-                    progress_cb('update', update_success, update_errors)
+            while not query_done:
+                # repeat process
+                items = selector(batch_size, offset, latest_id, last_update)
+                offset += len(items)
+                if len(items) < batch_size:
+                    query_done = True
+                for item in items:
+                    # at first, create new document
+                    try:
+                        doc = self._make_document(target, item)
+                    except exceptions.DocumentMakingError:
+                        update_errors += 1
+                        if error_cb is not None:
+                            error_cb('update', item)
+                        continue
+
+                    # delete the old document, identified by type and unique_id
+                    target_id = self._get_unique_id(target, item)
+                    self.delete(target, target_id, indexer)
+
+                    # add document (stories are skipped when hidden or not yet published)
+                    if target == "story":
+                        if item["neverdisplay"] != "1" and item["time"] <= datetime.now():
+                            indexer.add(doc)
+                    else:
+                        indexer.add(doc)
+
+                    update_success += 1
+
+                if progress_cb is not None:
+                    progress_cb('update', update_success, update_errors)

        success = add_success + update_success
        errors = add_errors + update_errors
-        # update metadata
-        self.metadata.update(target='stories', last_update=start_update, latest_id=max_stoid)
-
        # done
        return (success, errors)
-    def _get_primary_id(self, target, item):
-        if target == 'stories' or target == 'story':
-            return item['sid']
-
-        return None
-
-    def update_all_stories(self, batch_size=1000, progress_cb=None, error_cb=None):
-        '''update index for all stories'''
+    def update_story(self, batch_size=1000, progress_cb=None, error_cb=None):
+        '''update story index'''
        stories = newslash_db.Stories(self._db_config())
-        query_done = False
+        def selector(limit, offset, latest_id, last_update=None):
+            if last_update:
+                return stories.select(limit=limit, offset=offset, stoid_le=latest_id, last_update_ge=last_update)
+            else:
+                return stories.select(limit=limit, offset=offset, stoid_gt=latest_id)
+
+        return self._update(selector, 'story', batch_size, progress_cb, error_cb)
+
+    def update_comment(self, batch_size=1000, progress_cb=None, error_cb=None):
+        '''update comment index'''
+        comments = newslash_db.Comments(self._db_config())
+        def selector(limit, offset, latest_id, last_update=None):
+            if last_update:
+                return comments.select(limit=limit, offset=offset, cid_le=latest_id, last_update_ge=last_update)
+            else:
+                return comments.select(limit=limit, offset=offset, cid_gt=latest_id)
+
+        return self._update(selector, 'comment', batch_size, progress_cb, error_cb)
+
+    def update(self, batch_size=1000, progress_cb=None, error_cb=None):
+        '''update index'''
+        success, errors = self.update_story(batch_size, progress_cb, error_cb)
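+        # also run the comment indexer (assumption: update() should cover
+        # every target, not just stories)
+        comment_success, comment_errors = self.update_comment(batch_size, progress_cb, error_cb)
+        success += comment_success
+        errors += comment_errors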
-        success = 0
-        errors = 0
-        offset = 0
-        with lucene_wrapper.Indexer(index_directory=self.config("SearchIndex", "path")) as indexer:
-            while not query_done:
-                # repeat process for each 1000 items
-                items = stories.select(limit=batch_size, offset=offset)
-                offset += len(items)
-                if len(items) < batch_size:
-                    query_done = True
+
+        # done
+        return (success, errors)
-                for item in items:
-                    try:
-                        doc = self._make_story_document(item)
-                    except exceptions.DocumentMakingError:
-                        errors += 1
-                        if error_cb is not None:
-                            error_cb('add', item)
-                        continue
-                    indexer.add(doc)
-                    success += 1
+    def _get_unique_id(self, target, item):
+        if target == 'story':
+            return item['stoid']
+        if target == 'comment':
+            return item['cid']
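+        # unknown target: log to stderr and fall through to None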
-                if progress_cb is not None:
-                    progress_cb('add', success, errors)
+        sys.stderr.write("_get_unique_id - invalid target: {}\n".format(target))
+        return None
-        return (success, errors)
+    def _make_document(self, target, item):
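+        '''make Document object for target from a DB row'''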
+        if target == 'story':
+            return self._make_story_document(item)
+        elif target == 'comment':
+            return self._make_comment_document(item)
+        sys.stderr.write("_make_document - invalid target: {}\n".format(target))
+        return None
+
    def _make_story_document(self, item):
-        '''make Document object from query result'''
+        '''make Document object from story object'''
        doc = lucene_wrapper.Document()

+        # some items have invalid time data
        if item["time"] is None:
            raise exceptions.DocumentMakingError()

        # prepare intro-/body-text, url
        introtext = item["introtext"] or ""
        bodytext = item["bodytext"] or ""
        (content_text, urls) = htmlutil.strip_html_tag(introtext + bodytext)

        doc.add_string_field("type", "story")
-        doc.add_string_field("unique_id", str(item["stoid"]))
+        doc.add_int_field("unique_id", item["stoid"])
        doc.add_string_field("id", item["sid"])
        doc.add_text_field("title", item["title"])
        doc.add_text_field("content_text", content_text)

        for url in urls:
            doc.add_string_field("url", url)

        return doc
+
+    def _make_comment_document(self, item):
+        '''make Document object from comment object'''
+        doc = lucene_wrapper.Document()
+
+        # convert datetime to UNIX timestamp
+        timestamp = calendar.timegm(item["date"].utctimetuple())
+        last_update = calendar.timegm(item["last_update"].utctimetuple())
+
+        # prepare intro-/body-text, url
+        (content_text, urls) = htmlutil.strip_html_tag(item["comment"])
+
+        doc.add_string_field("type", "comment")
+        doc.add_int_field("unique_id", item["cid"])
+        doc.add_string_field("id", str(item["cid"]))
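+        # unique_id is an int field so delete() can match it with IntRangeQuery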
+
+        doc.add_text_field("title", item["subject"])
+        doc.add_text_field("content_text", content_text)
+
+        doc.add_int_field("create_time", timestamp)
+        doc.add_int_field("last_update", last_update)
+        doc.add_int_field("author", item["uid"])
+        doc.add_int_field("points", item["points"])
+
+        for url in urls:
+            doc.add_string_field("url", url)
+
+        return doc
+
    def get(self, item_type, item_id):
-        '''get document match iten_type and item_id from index'''
+        '''get document matching item_type and item_id from index'''
        searcher = self._get_searcher()