ns_search: implement comment indexer
[newslash/newslash.git] / src / ns_search / newslash_index / index.py
index 27945fc..4c6e5a6 100644
@@ -44,41 +44,58 @@ class Index(object):
             raise ConfigFileError(ConfigFileError.SECTION_NOT_FOUND, "Database")
         return db_cfg
 
+    def delete(self, target, unique_id, indexer=None):
+        '''delete document from index'''
+        # create query to identify target document
+        q = lucene_wrapper.BooleanQuery()
+        q.add_must(lucene_wrapper.TermQuery("type", target))
+        q.add_must(lucene_wrapper.IntRangeQuery("unique_id", unique_id))
+
+        # delete document, reusing the caller's indexer if one was given
+        if indexer:
+            indexer.delete(q)
+        else:
+            with lucene_wrapper.Indexer(index_directory=self.config("SearchIndex", "path")) as indexer:
+                indexer.delete(q)
+
     def delete_all(self):
         '''delete all document from index'''
         with lucene_wrapper.Indexer(index_directory=self.config("SearchIndex", "path")) as indexer:
             indexer.delete_all()
 
-    def update(self, target, batch_size=1000, progress_cb=None, error_cb=None):
-        '''update story index'''
-        stories = newslash_db.Stories(self._db_config())
+    def _update(self, selector, target, batch_size=1000, progress_cb=None, error_cb=None):
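+        '''update index for the given target; selector(limit, offset, latest_id, last_update) fetches rows from the database'''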
         query_done = False
 
         # at first, get last indexed id and timestamp
-        stories_data = self.metadata.get('stories')
-        latest_id = stories_data.get('latest_id', 0)
-        last_update = stories_data.get('last_update')
+        metadata = self.metadata.get(target)
+        latest_id = metadata.get('latest_id', 0)
+        last_update = metadata.get('last_update')
 
-        # add new stories to index
-        start_update = datetime.now()
+        # add new documents to index
+        # don't use datetime.now() here: the database server's clock may differ from this server's
+        start_update = self.metadata.get_current_timestamp()
         add_success = 0
         add_errors = 0
         offset = 0
-        max_stoid = 0
+        max_unique_id = 0
         with lucene_wrapper.Indexer(index_directory=self.config("SearchIndex", "path")) as indexer:
             while not query_done:
                 # repeat process
-                items = stories.select(limit=batch_size, offset=offset, stoid_gt=latest_id)
+                items = selector(batch_size, offset, latest_id)
                 offset += len(items)
                 if len(items) < batch_size:
                     query_done = True
 
                 for item in items:
-                    if item["neverdisplay"] == "1":
+                    if target == "story" and item["neverdisplay"] == "1":
                         add_success += 1
                         continue
                     try:
-                        doc = self._make_story_document(item)
+                        doc = self._make_document(target, item)
                     except exceptions.DocumentMakingError:
                         add_errors += 1
                         if error_cb is not None:
@@ -90,33 +107,57 @@ class Index(object):
 
                 if progress_cb is not None:
                     progress_cb('add', add_success, add_errors)
 
                 for item in items:
-                    if item["stoid"] > max_stoid:
-                        max_stoid = item["stoid"];
+                    unique_id = self._get_unique_id(target, item)
+                    if unique_id > max_unique_id:
+                        max_unique_id = unique_id
+
+        # update metadata
+        self.metadata.update(target=target, last_update=start_update, latest_id=max_unique_id)
+
+        # if no previous update, done
+        if last_update is None:
+            return (add_success, add_errors)
 
         # update index for updated documents
         update_success = 0
         update_errors = 0
+        offset = 0
+        query_done = False
         with lucene_wrapper.Indexer(index_directory=self.config("SearchIndex", "path")) as indexer:
-            items = stories.select(stoid_le=latest_id, last_update_ge=last_update)
-            for item in items:
-                # first, create term to identify target document
-                target_id = self._get_primary_id(target, item)
-                term = lucene_wrapper.BooleanQuery()
-                term.add_must(lucene_wrapper.TermQuery("type", target))
-                term.add_must(lucene_wrapper.TermQuery("id", target_id))
-
-                try:
-                    doc = self._make_story_document(item)
-                except exceptions.DocumentMakingError:
-                    update_errors += 1
-                    if error_cb is not None:
-                        error_cb('update', item)
-                    continue
-                indexer.delete(term)
-                if item["neverdisplay"] != "1" and item["time"] <= datetime.now():
-                    indexer.add(doc)
-                update_success += 1
+            while not query_done:
+                # repeat process until all updated rows are consumed
+                items = selector(batch_size, offset, latest_id, last_update)
+                offset += len(items)
+                if len(items) < batch_size:
+                    query_done = True
+
+                for item in items:
+                    # at first, create new document
+                    try:
+                        doc = self._make_document(target, item)
+                    except exceptions.DocumentMakingError:
+                        update_errors += 1
+                        if error_cb is not None:
+                            error_cb('update', item)
+                        continue
+
+                    # delete old document, then add the new one
+                    target_id = self._get_unique_id(target, item)
+                    self.delete(target, target_id, indexer)
+
+                    # skip stories that are hidden or not yet published
+                    if target == "story":
+                        if item["neverdisplay"] != "1" and item["time"] <= datetime.now():
+                            indexer.add(doc)
+                    else:
+                        indexer.add(doc)
+
+                    update_success += 1
 
         if progress_cb is not None:
@@ -125,53 +166,60 @@ class Index(object):
         success = add_success + update_success
         errors = add_errors + update_errors
 
-        # update metadata
-        self.metadata.update(target='stories', last_update=start_update, latest_id=max_stoid)
-
         # done
         return (success, errors)
 
-    def _get_primary_id(self, target, item):
-        if target == 'stories' or target == 'story':
-            return item['sid']
-
-        return None
-    
-    def update_all_stories(self, batch_size=1000, progress_cb=None, error_cb=None):
-        '''update index for all stories'''
+    def update_story(self, batch_size=1000, progress_cb=None, error_cb=None):
+        '''update story index'''
         stories = newslash_db.Stories(self._db_config())
-        query_done = False
+        def selector(limit, offset, latest_id, last_update=None):
+            if last_update:
+                return stories.select(limit=limit, offset=offset, stoid_le=latest_id, last_update_ge=last_update)
+            else:
+                return stories.select(limit=limit, offset=offset, stoid_gt=latest_id)
+
+        return self._update(selector, 'story', batch_size, progress_cb, error_cb)
+            
+    def update_comment(self, batch_size=1000, progress_cb=None, error_cb=None):
+        '''update comment index'''
+        comments = newslash_db.Comments(self._db_config())
+        def selector(limit, offset, latest_id, last_update=None):
+            if last_update:
+                return comments.select(limit=limit, offset=offset, cid_le=latest_id, last_update_ge=last_update)
+            else:
+                return comments.select(limit=limit, offset=offset, cid_gt=latest_id)
+
+        return self._update(selector, 'comment', batch_size, progress_cb, error_cb)
+
+    def update(self, batch_size=1000, progress_cb=None, error_cb=None):
+        '''update index for all targets'''
+        s_success, s_errors = self.update_story(batch_size, progress_cb, error_cb)
+        c_success, c_errors = self.update_comment(batch_size, progress_cb, error_cb)
 
-        success = 0
-        errors = 0
-        offset = 0
-        with lucene_wrapper.Indexer(index_directory=self.config("SearchIndex", "path")) as indexer:
-            while not query_done:
-                # repeat process for each 1000 items
-                items = stories.select(limit=batch_size, offset=offset)
-                offset += len(items)
-                if len(items) < batch_size:
-                    query_done = True
+        # done
+        return (s_success + c_success, s_errors + c_errors)
 
-                for item in items:
-                    try:
-                        doc = self._make_story_document(item)
-                    except exceptions.DocumentMakingError:
-                        errors += 1
-                        if error_cb is not None:
-                            error_cb('add', item)
-                        continue
-                    indexer.add(doc)
-                    success += 1
+    def _get_unique_id(self, target, item):
+        if target == 'story':
+            return item['stoid']
+        if target == 'comment':
+            return item['cid']
 
-                if progress_cb is not None:
-                    progress_cb('add', success, errors)
+        sys.stderr.write("_get_unique_id - invalid target: {}".format(target))
+        return None
 
-        return (success, errors)
+    def _make_document(self, target, item):
+        if target == 'story':
+            return self._make_story_document(item)
+        elif target == 'comment':
+            return self._make_comment_document(item)
 
+        sys.stderr.write("_make_document - invalid target: {}".format(target))
+        return None
+    
     def _make_story_document(self, item):
-        '''make Document object from query result'''
+        '''make Document object from story object'''
         doc = lucene_wrapper.Document()
+        # some items have invalid time data
         if item["time"] is None:
             raise exceptions.DocumentMakingError()
 
@@ -185,7 +233,7 @@ class Index(object):
         (content_text, urls) = htmlutil.strip_html_tag(introtext + bodytext)
 
         doc.add_string_field("type", "story")
-        doc.add_string_field("unique_id", str(item["stoid"]))
+        doc.add_int_field("unique_id", item["stoid"])
         doc.add_string_field("id", item["sid"])
 
         doc.add_text_field("title", item["title"])
@@ -203,6 +251,35 @@ class Index(object):
 
         return doc
     
+    def _make_comment_document(self, item):
+        '''make Document object from comment object'''
+        doc = lucene_wrapper.Document()
+
+        # some items may have invalid date data
+        if item["date"] is None or item["last_update"] is None:
+            raise exceptions.DocumentMakingError()
+
+        # convert datetime to UNIX timestamp
+        timestamp = calendar.timegm(item["date"].utctimetuple())
+        last_update = calendar.timegm(item["last_update"].utctimetuple())
+
+        # prepare intro-/body-text, url
+        (content_text, urls) = htmlutil.strip_html_tag(item["comment"])
+
+        doc.add_string_field("type", "comment")
+        doc.add_int_field("unique_id", item["cid"])
+        doc.add_string_field("id", str(item["cid"]))
+
+        doc.add_text_field("title", item["subject"])
+        doc.add_text_field("content_text", content_text)
+
+        doc.add_int_field("create_time", timestamp)
+        doc.add_int_field("last_update", last_update)
+        doc.add_int_field("author", item["uid"])
+        doc.add_int_field("points", item["points"])
+
+        for url in urls:
+            doc.add_string_field("url", url)
+
+        return doc
+    
     def get(self, item_type, item_id):
         '''get document matching item_type and item_id from index'''
         searcher = self._get_searcher()
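For reference, a minimal usage sketch of the public API this commit touches. It is illustrative only: the Index constructor is not shown in this diff, so its arguments here are assumptions, and the import path is inferred from the file location.

    # usage sketch -- Index construction is assumed, not shown in this diff
    from newslash_index import Index

    index = Index()  # hypothetical constructor call; real arguments not shown here

    def progress(phase, success, errors):
        # phase is 'add' or 'update', matching the callbacks in _update()
        print("{}: {} indexed, {} errors".format(phase, success, errors))

    def on_error(phase, item):
        # called once per item whose document could not be built
        print("document build failed during {} phase".format(phase))

    # index new and updated stories, then comments
    s_ok, s_err = index.update_story(batch_size=1000, progress_cb=progress, error_cb=on_error)
    c_ok, c_err = index.update_comment(batch_size=1000, progress_cb=progress, error_cb=on_error)

    # delete a single comment; documents are identified by type + unique_id
    index.delete("comment", 12345)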