ns_search: implement comment indexer
[newslash/newslash.git] / src / ns_search / newslash_index / index.py
index 27945fc..4c6e5a6 100644
@@ -44,41 +44,58 @@ class Index(object):
             raise ConfigFileError(ConfigFileError.SECTION_NOT_FOUND, "Database")
         return db_cfg
 
+    def delete(self, target, unique_id, indexer=None):
+        '''delete document from index'''
+        # create query to identify target document
+        q = lucene_wrapper.BooleanQuery()
+        q.add_must(lucene_wrapper.TermQuery("type", target))
+        q.add_must(lucene_wrapper.IntRangeQuery("unique_id", unique_id))
+
+        # delete document, reusing the caller's indexer if one was given
+        if indexer:
+            indexer.delete(q)
+        else:
+            with lucene_wrapper.Indexer(index_directory=self.config("SearchIndex", "path")) as indexer:
+                indexer.delete(q)
+
     def delete_all(self):
         '''delete all document from index'''
         with lucene_wrapper.Indexer(index_directory=self.config("SearchIndex", "path")) as indexer:
             indexer.delete_all()
 
-    def update(self, target, batch_size=1000, progress_cb=None, error_cb=None):
-        '''update story index'''
-        stories = newslash_db.Stories(self._db_config())
+    def _update(self, selector, target, batch_size=1000, progress_cb=None, error_cb=None):
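+        '''update index for the given target; selector(limit, offset, latest_id, last_update) fetches rows from the database'''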
         query_done = False
 
         # at first, get last indexed id and timestamp
-        stories_data = self.metadata.get('stories')
-        latest_id = stories_data.get('latest_id', 0)
-        last_update = stories_data.get('last_update')
+        metadata = self.metadata.get(target)
+        latest_id = metadata.get('latest_id', 0)
+        last_update = metadata.get('last_update')
 
-        # add new stories to index
-        start_update = datetime.now()
+        # add new documents to index
+        # don't use datetime.now() here: the database server's clock may differ from this server's
+        start_update = self.metadata.get_current_timestamp()
         add_success = 0
         add_errors = 0
         offset = 0
-        max_stoid = 0
+        max_unique_id = 0
         with lucene_wrapper.Indexer(index_directory=self.config("SearchIndex", "path")) as indexer:
             while not query_done:
                 # repeat process
-                items = stories.select(limit=batch_size, offset=offset, stoid_gt=latest_id)
+                items = selector(batch_size, offset, latest_id)
                 offset += len(items)
                 if len(items) < batch_size:
                     query_done = True
 
                 for item in items:
-                    if item["neverdisplay"] == "1":
+                    if target == "story" and item["neverdisplay"] == "1":
                         add_success += 1
                         continue
                     try:
-                        doc = self._make_story_document(item)
+                        doc = self._make_document(target, item)
                     except exceptions.DocumentMakingError:
                         add_errors += 1
                         if error_cb is not None:
@@ -90,33 +107,57 @@ class Index(object):
 
                 if progress_cb is not None:
                     progress_cb('add', add_success, add_errors)
 
                 for item in items:
-                    if item["stoid"] > max_stoid:
-                        max_stoid = item["stoid"];
+                    unique_id = self._get_unique_id(target, item)
+                    if unique_id > max_unique_id:
+                        max_unique_id = unique_id
+
+        # update metadata
+        self.metadata.update(target=target, last_update=start_update, latest_id=max_unique_id)
+
+        # if no previous update, done
+        if last_update is None:
+            return (add_success, add_errors)
 
         # update index for updated documents
         update_success = 0
         update_errors = 0
+        offset = 0
+        query_done = False
         with lucene_wrapper.Indexer(index_directory=self.config("SearchIndex", "path")) as indexer:
-            items = stories.select(stoid_le=latest_id, last_update_ge=last_update)
-            for item in items:
-                # first, create term to identify target document
-                target_id = self._get_primary_id(target, item)
-                term = lucene_wrapper.BooleanQuery()
-                term.add_must(lucene_wrapper.TermQuery("type", target))
-                term.add_must(lucene_wrapper.TermQuery("id", target_id))
-
-                try:
-                    doc = self._make_story_document(item)
-                except exceptions.DocumentMakingError:
-                    update_errors += 1
-                    if error_cb is not None:
-                        error_cb('update', item)
-                    continue
-                indexer.delete(term)
-                if item["neverdisplay"] != "1" and item["time"] <= datetime.now():
-                    indexer.add(doc)
-                update_success += 1
+            while not query_done:
+                # repeat process until all updated rows are consumed
+                items = selector(batch_size, offset, latest_id, last_update)
+                offset += len(items)
+                if len(items) < batch_size:
+                    query_done = True
+
+                for item in items:
+                    # at first, create new document
+                    try:
+                        doc = self._make_document(target, item)
+                    except exceptions.DocumentMakingError:
+                        update_errors += 1
+                        if error_cb is not None:
+                            error_cb('update', item)
+                        continue
+
+                    # delete old document, then add the new one
+                    target_id = self._get_unique_id(target, item)
+                    self.delete(target, target_id, indexer)
+
+                    # skip stories that are hidden or not yet published
+                    if target == "story":
+                        if item["neverdisplay"] != "1" and item["time"] <= datetime.now():
+                            indexer.add(doc)
+                    else:
+                        indexer.add(doc)
+
+                    update_success += 1
 
         if progress_cb is not None:
@@ -125,53 +166,60 @@ class Index(object):
         success = add_success + update_success
         errors = add_errors + update_errors
 
-        # update metadata
-        self.metadata.update(target='stories', last_update=start_update, latest_id=max_stoid)
-
         # done
         return (success, errors)
 
-    def _get_primary_id(self, target, item):
-        if target == 'stories' or target == 'story':
-            return item['sid']
-
-        return None
-    
-    def update_all_stories(self, batch_size=1000, progress_cb=None, error_cb=None):
-        '''update index for all stories'''
+    def update_story(self, batch_size=1000, progress_cb=None, error_cb=None):
+        '''update story index'''
         stories = newslash_db.Stories(self._db_config())
-        query_done = False
+        def selector(limit, offset, latest_id, last_update=None):
+            if last_update:
+                return stories.select(limit=limit, offset=offset, stoid_le=latest_id, last_update_ge=last_update)
+            else:
+                return stories.select(limit=limit, offset=offset, stoid_gt=latest_id)
+
+        return self._update(selector, 'story', batch_size, progress_cb, error_cb)
+            
+    def update_comment(self, batch_size=1000, progress_cb=None, error_cb=None):
+        '''update comment index'''
+        comments = newslash_db.Comments(self._db_config())
+        def selector(limit, offset, latest_id, last_update=None):
+            if last_update:
+                return comments.select(limit=limit, offset=offset, cid_le=latest_id, last_update_ge=last_update)
+            else:
+                return comments.select(limit=limit, offset=offset, cid_gt=latest_id)
+
+        return self._update(selector, 'comment', batch_size, progress_cb, error_cb)
+
+    def update(self, batch_size=1000, progress_cb=None, error_cb=None):
+        '''update index for all targets'''
+        s_success, s_errors = self.update_story(batch_size, progress_cb, error_cb)
+        c_success, c_errors = self.update_comment(batch_size, progress_cb, error_cb)
 
-        success = 0
-        errors = 0
-        offset = 0
-        with lucene_wrapper.Indexer(index_directory=self.config("SearchIndex", "path")) as indexer:
-            while not query_done:
-                # repeat process for each 1000 items
-                items = stories.select(limit=batch_size, offset=offset)
-                offset += len(items)
-                if len(items) < batch_size:
-                    query_done = True
+        # done
+        return (s_success + c_success, s_errors + c_errors)
 
-                for item in items:
-                    try:
-                        doc = self._make_story_document(item)
-                    except exceptions.DocumentMakingError:
-                        errors += 1
-                        if error_cb is not None:
-                            error_cb('add', item)
-                        continue
-                    indexer.add(doc)
-                    success += 1
+    def _get_unique_id(self, target, item):
+        if target == 'story':
+            return item['stoid']
+        if target == 'comment':
+            return item['cid']
 
-                if progress_cb is not None:
-                    progress_cb('add', success, errors)
+        sys.stderr.write("_get_unique_id - invalid target: {}".format(target))
+        return None
 
-        return (success, errors)
+    def _make_document(self, target, item):
+        if target == 'story':
+            return self._make_story_document(item)
+        elif target == 'comment':
+            return self._make_comment_document(item)
 
+        sys.stderr.write("_make_document - invalid target: {}".format(target))
+        return None
+    
     def _make_story_document(self, item):
-        '''make Document object from query result'''
+        '''make Document object from story object'''
         doc = lucene_wrapper.Document()
+        # some items have invalid time data
         if item["time"] is None:
             raise exceptions.DocumentMakingError()
 
@@ -185,7 +233,7 @@ class Index(object):
         (content_text, urls) = htmlutil.strip_html_tag(introtext + bodytext)
 
         doc.add_string_field("type", "story")
-        doc.add_string_field("unique_id", str(item["stoid"]))
+        doc.add_int_field("unique_id", item["stoid"])
         doc.add_string_field("id", item["sid"])
 
         doc.add_text_field("title", item["title"])
@@ -203,6 +251,35 @@ class Index(object):
 
         return doc
     
+    def _make_comment_document(self, item):
+        '''make Document object from comment object'''
+        doc = lucene_wrapper.Document()
+
+        # some items may have invalid date data
+        if item["date"] is None or item["last_update"] is None:
+            raise exceptions.DocumentMakingError()
+
+        # convert datetime to UNIX timestamp
+        timestamp = calendar.timegm(item["date"].utctimetuple())
+        last_update = calendar.timegm(item["last_update"].utctimetuple())
+
+        # prepare intro-/body-text, url
+        (content_text, urls) = htmlutil.strip_html_tag(item["comment"])
+
+        doc.add_string_field("type", "comment")
+        doc.add_int_field("unique_id", item["cid"])
+        doc.add_string_field("id", str(item["cid"]))
+
+        doc.add_text_field("title", item["subject"])
+        doc.add_text_field("content_text", content_text)
+
+        doc.add_int_field("create_time", timestamp)
+        doc.add_int_field("last_update", last_update)
+        doc.add_int_field("author", item["uid"])
+        doc.add_int_field("points", item["points"])
+
+        for url in urls:
+            doc.add_string_field("url", url)
+
+        return doc
+    
     def get(self, item_type, item_id):
         '''get document matching item_type and item_id from index'''
         searcher = self._get_searcher()
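For reference, a minimal usage sketch of the public API this commit touches. It is illustrative only: the Index constructor is not shown in this diff, so its arguments here are assumptions, and the import path is inferred from the file location.

    # usage sketch -- Index construction is assumed, not shown in this diff
    from newslash_index import Index

    index = Index()  # hypothetical constructor call; real arguments not shown here

    def progress(phase, success, errors):
        # phase is 'add' or 'update', matching the callbacks in _update()
        print("{}: {} indexed, {} errors".format(phase, success, errors))

    def on_error(phase, item):
        # called once per item whose document could not be built
        print("document build failed during {} phase".format(phase))

    # index new and updated stories, then comments
    s_ok, s_err = index.update_story(batch_size=1000, progress_cb=progress, error_cb=on_error)
    c_ok, c_err = index.update_comment(batch_size=1000, progress_cb=progress, error_cb=on_error)

    # delete a single comment; documents are identified by type + unique_id
    index.delete("comment", 12345)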