OSDN Git Service

ns_search: change search index structure - store metadata to file
author hylom <hylom@users.osdn.me>
Thu, 6 Jun 2019 16:25:16 +0000 (16:25 +0000)
committer hylom <hylom@users.osdn.me>
Thu, 6 Jun 2019 16:25:16 +0000 (16:25 +0000)
src/ns_search/newslash_index/exceptions.py
src/ns_search/newslash_index/index.py
src/ns_search/newslash_index/metadata.py
src/ns_search/searchd.py
src/ns_search/searchd_cli.py
test-container/ns-searchd/Makefile

index 9a9222d..4977d7d 100644 (file)
@@ -8,6 +8,7 @@ class NewslashIndexError(Exception):
 
 class ConfigFileError(NewslashIndexError):
     SECTION_NOT_FOUND = "Section not found"
+    PARAMETER_NOT_FOUND = "Parameter not found"
     def __init__(self, reason, section=""):
         self.message = "Config Error - {}: {}".format(reason, section)
 
index 38e5241..2fbaa96 100644 (file)
@@ -3,6 +3,7 @@
 
 import calendar
 from datetime import datetime
+import os.path
 
 import exceptions
 import lucene_wrapper
@@ -24,9 +25,12 @@ class Index(object):
             self.config[k] = kwargs[k]
 
         self.metadata = Metadata(self.config)
+        p = self.config.get("index_path")
+        self._index_path = os.path.join(p, "lucene_index");
+
 
     def _get_searcher(self):
+        """Return a lucene_wrapper.Searcher opened on self._index_path."""
-        return lucene_wrapper.Searcher(index_directory=self.config.get("index_path"))
+        return lucene_wrapper.Searcher(index_directory=self._index_path)
 
     def _db_config(self):
         db_cfg = self.config.get("database")
@@ -49,12 +53,12 @@ class Index(object):
         if indexer: 
             indexer.delete(q)
         else:
-            with lucene_wrapper.Indexer(index_directory=self.config.get("index_path")) as indexer:
+            with lucene_wrapper.Indexer(index_directory=self._index_path) as indexer:
                 indexer.delete(q)
 
     def delete_all(self):
         '''delete all document from index'''
+        # self._index_path is os.path.join(<configured index_path>, "lucene_index"),
+        # so the Lucene files no longer sit directly in index_path.
-        with lucene_wrapper.Indexer(index_directory=self.config.get("index_path")) as indexer:
+        with lucene_wrapper.Indexer(index_directory=self._index_path) as indexer:
             indexer.delete_all()
 
     def _get_unique_id(self, target, item):
@@ -88,7 +92,7 @@ class Index(object):
         max_unique_id = latest_id
 
         force_exit = False
-        with lucene_wrapper.Indexer(index_directory=self.config.get("index_path")) as indexer:
+        with lucene_wrapper.Indexer(index_directory=self._index_path) as indexer:
             try:
                 while not query_done:
                     items = selector(batch_size, 0, max_unique_id)
@@ -148,7 +152,7 @@ class Index(object):
         update_success = 0
         update_errors = 0
         offset = 0
-        with lucene_wrapper.Indexer(index_directory=self.config.get("index_path")) as indexer:
+        with lucene_wrapper.Indexer(index_directory=self._index_path) as indexer:
             items = selector(batch_size, offset, latest_id, last_update)
             offset += len(items)
             if len(items) < batch_size:
index 470bcd2..7153ebf 100644 (file)
@@ -2,57 +2,99 @@
 """metadata: index metadata for newslash search system"""
 
 import newslash_db
-from mysql.connector.errors import ProgrammingError
+import sqlite3
+import os.path
+#from mysql.connector.errors import ProgrammingError
 import exceptions
 
+class SQLiteDB(object):
+    """Thin wrapper around a lazily opened sqlite3 connection.
+
+    The connection is opened on first use (cursor()/execute()) and pending
+    changes are committed by close(). Most methods return self so calls
+    can be chained.
+    """
+
+    def __init__(self, path):
+        """Remember the database file path; no connection is opened yet."""
+        self._path = path
+        self._conn = None
+
+    def connect(self):
+        """Open a sqlite3 connection to the stored path and return self."""
+        self._conn = sqlite3.connect(self._path)
+        # Autocommit is not enabled; pending changes are committed by close().
+        #self._conn.autocommit = True
+        return self
+
+    def cursor(self):
+        """Return a new cursor, connecting first if necessary."""
+        if self._conn is None:
+            self.connect()
+        return self._conn.cursor()
+
+    def execute(self, query, **kwargs):
+        """Run query with kwargs as named (:name) parameters.
+
+        Returns the cursor the statement was executed on.
+        """
+        cur = self.cursor()
+        cur.execute(query, kwargs)
+        return cur
+
+    def commit(self):
+        """Commit pending changes; a no-op when not connected."""
+        # Guard against commit() before any connection has been opened,
+        # which would otherwise fail with AttributeError on None.
+        if self._conn is not None:
+            self._conn.commit()
+        return self
+
+    def rollback(self):
+        """Roll back pending changes; a no-op when not connected."""
+        if self._conn is not None:
+            self._conn.rollback()
+        return self
+
+    def close(self):
+        """Commit pending changes and close the connection, if one is open."""
+        if self._conn is not None:
+            try:
+                self.commit()
+            finally:
+                # Release the connection even when the final commit fails.
+                self._conn.close()
+                self._conn = None
+        return self
+
+
 class Metadata(object):
     def __init__(self, config):
+        """Store config and derive the path of the metadata database file.
+
+        The database file is "_ns_metadata.db" inside config["index_path"].
+        Raises exceptions.ConfigFileError (PARAMETER_NOT_FOUND) when
+        "index_path" is missing or empty.
+        """
         self._config = config
+        path = config.get("index_path")
+        if not path:
+            raise exceptions.ConfigFileError(exceptions.ConfigFileError.PARAMETER_NOT_FOUND, "index_path")
+            
+        self._db_path = os.path.join(path, "_ns_metadata.db");
 
-    def _db_config(self):
-        db_cfg = self._config.get("database")
-        if db_cfg is None:
-            raise exceptions.ConfigFileError(exceptions.ConfigFileError.SECTION_NOT_FOUND, "Database")
-        return db_cfg
-
+    def _get_db(self):
+        """Return a new SQLiteDB wrapper for the metadata database file."""
+        return SQLiteDB(self._db_path);
+    
     def get_current_timestamp(self):
+        """Return the result of SQLite's datetime('now') from the metadata DB."""
-        db = newslash_db.NewslashDB(self._db_config())
-        cur = db.execute('SELECT NOW()')
-        return cur.fetchone()[0]
+        db = self._get_db()
+        cur = db.execute("SELECT datetime('now')")
+        r = cur.fetchone()[0]
+        db.close()
+        return r
 
     def create_table(self):
+        """Create the ns_search_metadata table in the metadata database.
+
+        Raises exceptions.DatabaseError when sqlite3 reports an
+        OperationalError while executing the CREATE TABLE statement.
+        """
         sql = (
             "CREATE TABLE ns_search_metadata ("
-            "  target_id    tinyint(8) unsigned NOT NULL AUTO_INCREMENT,"
-            "  target_name  varchar(32)         NOT NULL UNIQUE,"
-            "  last_update  timestamp           NOT NULL DEFAULT CURRENT_TIMESTAMP,"
-            "  latest_id    int(8) unsigned     NOT NULL DEFAULT 0,"
-            "  PRIMARY KEY (target_id)"
+            "  target_id    INTEGER PRIMARY KEY AUTOINCREMENT,"
+            "  target_name  TEXT    NOT NULL UNIQUE,"
+            "  last_update  TEXT    NOT NULL DEFAULT CURRENT_TIMESTAMP,"
+            "  latest_id    INTEGER NOT NULL DEFAULT 0"
             ")"
         )
 
-        db= newslash_db.NewslashDB(self._db_config())
+        db = self._get_db()
         try:
             db.execute(sql)
-        except ProgrammingError as e:
+        except sqlite3.OperationalError as e:
+            # NOTE(review): only OperationalError is handled here; any other
+            # exception propagates without db.close() being called.
             db.close()
             raise exceptions.DatabaseError('table creation error: {}'.format(str(e)))
 
         db.close()
 
     def drop_table(self):
+        """Drop the ns_search_metadata table from the metadata database."""
-        db = newslash_db.NewslashDB(self._db_config())
+        db = self._get_db()
         db.execute('DROP TABLE ns_search_metadata')
         db.close()
 
     def delete_all(self):
+        """Delete every row from ns_search_metadata."""
-        db = newslash_db.NewslashDB(self._db_config())
+        db = self._get_db()
         db.execute('DELETE FROM ns_search_metadata')
+        # No explicit commit here: SQLiteDB.close() commits before closing.
         db.close()
 
     def get(self, target):
-        sql = 'SELECT * from ns_search_metadata WHERE target_name = %(target)s'
-        db = newslash_db.NewslashDB(self._db_config())
+        sql = 'SELECT * from ns_search_metadata WHERE target_name = :target'
+        db = self._get_db()
         cur = db.execute(sql, target=target)
         if cur.rowcount > 0:
             result = dict(zip(cur.column_names, cur.fetchone()))
@@ -63,25 +105,56 @@ class Metadata(object):
         return result
 
     def update(self, target, last_update, latest_id):
+        # sqlite 3.24 or after, can use ON CONFLICT, but current package's version
+        # not supported...
+        # if latest_id > 0:
+        #     sql = (
+        #         "INSERT INTO ns_search_metadata"
+        #         "  (target_name, last_update, latest_id)"
+        #         "  VALUES (:target, :last_update, :latest_id)"
+        #         "  ON CONFLICT (target_name) DO UPDATE SET"
+        #         "    last_update = :last_update,"
+        #         "    latest_id = :latest_id"
+        #     )
+        # else:
+        #     sql = (
+        #         "INSERT INTO ns_search_metadata"
+        #         "  (target_name, last_update, latest_id)"
+        #         "  VALUES (:target, :last_update, :latest_id)"
+        #         "  ON CONFLIT (target_name) DO UPDATE SET"
+        #         "    last_update = :last_update"
+        #     )
         if latest_id > 0:
-            sql = (
-                "INSERT INTO ns_search_metadata"
+            update_sql = (
+                "UPDATE OR IGNORE ns_search_metadata"
+                "  SET last_update = :last_update,"
+                "      latest_id = :latest_id"
+                "  WHERE target_name = :target"
+            )
+            insert_sql = (
+                "INSERT OR IGNORE INTO ns_search_metadata"
                 "  (target_name, last_update, latest_id)"
-                "  VALUES (%(target)s, %(last_update)s, %(latest_id)s)"
-                "  ON DUPLICATE KEY UPDATE"
-                "    last_update = %(last_update)s,"
-                "    latest_id = %(latest_id)s"
+                "  VALUES (:target, :last_update, :latest_id)"
             )
         else:
-            sql = (
-                "INSERT INTO ns_search_metadata"
+            update_sql = (
+                "UPDATE OR IGNORE ns_search_metadata"
+                "  SET last_update = :last_update,"
+                "  WHERE target_name = :target"
+            )
+            insert_sql = (
+                "INSERT OR IGNORE INTO ns_search_metadata"
                 "  (target_name, last_update, latest_id)"
-                "  VALUES (%(target)s, %(last_update)s, %(latest_id)s)"
-                "  ON DUPLICATE KEY UPDATE"
-                "    last_update = %(last_update)s"
+                "  VALUES (:target, :last_update, :latest_id)"
             )
 
-        db = newslash_db.NewslashDB(self._db_config())
-        cur = db.execute(sql, target=target, last_update=last_update, latest_id=latest_id)
+        db = self._get_db()
+        try:
+            cur = db.execute(insert_sql, target=target, last_update=last_update, latest_id=latest_id)
+            cur = db.execute(update_sql, target=target, last_update=last_update, latest_id=latest_id)
+        except sqlite3.OperationalError as e:
+            db.close()
+            raise exceptions.DatabaseError('ns_search_metadata update error: {}'.format(str(e)))
+
         db.close()
 
index e071bc0..2417245 100644 (file)
@@ -101,7 +101,8 @@ class Root(Route):
         #if req.environ['wsgi.errors']:
         #req.environ['wsgi.errors'].write(query.encode('utf-8'))
 
-        searcher = lucene_wrapper.Searcher(index_directory=self.config('Searchd', 'index_path'))
+        index_dir = os.path.join(self.config('Searchd', 'index_path'), 'lucene_index')
+        searcher = lucene_wrapper.Searcher(index_directory=index_dir)
         try:
             content_query = lucene_wrapper.Query("content_text", query_text)
             title_query = lucene_wrapper.Query("title", query_text)
index b0e76db..f97bc6f 100755 (executable)
@@ -19,8 +19,6 @@ import lucene_wrapper
 from newslash_index import Index, DatabaseError
 from newslash_db import NewslashDB
 
-CONFIG_FILE="./.config.yml"
-
 class SearchCLIError(Exception):
     def __init__(self, message):
         self.message = message
@@ -96,7 +94,7 @@ class SearchCLI(object):
         try:
             fh = open(pathname)
         except IOError:
-            sys.stderr.write('config file ({}) not found...'.format(CONFIG_FILE))
+            sys.stderr.write('config file ({}) not found...'.format(pathname))
             self.config = {}
             return
 
index d33e29a..ebbec0c 100644 (file)
@@ -3,14 +3,24 @@ IMAGE_NAME=newslash-searchd
 CONTAINER_NAME=newslash-searchd
 
 PORT_OPTS=-p 6000:6000
-LINK_OPTS=--link srad-db:srad-test-db --link newslash-redis:newslash-redis
-CERTS_OPTS=-v /etc/letsencrypt:/var/certs
-ENV=-e "MOJO_LISTEN=https://*:3000?cert=/var/certs/live/sdtest.osdn.co.jp/cert.pem&key=/var/certs/live/sdtest.osdn.co.jp/privkey.pem"
+LINK_OPTS=--link srad-db:srad-test-db \
+          --link newslash-redis:newslash-redis \
+         --link newslash-db:newslash-db 
+MOUNT_OPTS=-v /fioa1/newslash-searchd:/var/lucene_index
+COMMAND=python /var/newslash/src/ns_search/searchd.py
 
+HOST:=$(shell docker inspect $(CONTAINER_NAME) --format='{{range .NetworkSettings.Networks}}{{.IPAddress}}{{end}}' $(NAME) | uniq)
 
 image: Dockerfile files
        docker build $(DOCKER_OPTS) -f $< -t $(IMAGE_NAME) .
 
+# Start searchd detached (-d) in a container named $(CONTAINER_NAME).
+run-daemon:
+       docker run -d --name $(CONTAINER_NAME) $(PORT_OPTS) $(MOUNT_OPTS) $(LINK_OPTS) -v $(NEWSLASH_DIR):/var/newslash $(IMAGE_NAME) $(COMMAND)
+
+restart:
+       docker restart $(CONTAINER_NAME)
 run:
-       docker run -ti --rm -p 6000:6000 -v $(NEWSLASH_DIR):/var/newslash $(IMAGE_NAME) bash
+       docker run -ti --rm  $(PORT_OPTS) $(MOUNT_OPTS) $(LINK_OPTS) -v $(NEWSLASH_DIR):/var/newslash $(IMAGE_NAME) bash
 
+test-query:
+       curl http://$(HOST):6000/ -X POST -H 'Content-Type: application/json' -d '{"query":"firefox", "target":"story"}'