From a9719994a440688b1166bc0ab9591597be55c2ec Mon Sep 17 00:00:00 2001 From: hylom Date: Thu, 6 Jun 2019 16:25:16 +0000 Subject: [PATCH] ns_search: change search index structure - store metadata to file --- src/ns_search/newslash_index/exceptions.py | 1 + src/ns_search/newslash_index/index.py | 14 ++- src/ns_search/newslash_index/metadata.py | 141 ++++++++++++++++++++++------- src/ns_search/searchd.py | 3 +- src/ns_search/searchd_cli.py | 4 +- test-container/ns-searchd/Makefile | 18 +++- 6 files changed, 134 insertions(+), 47 deletions(-) diff --git a/src/ns_search/newslash_index/exceptions.py b/src/ns_search/newslash_index/exceptions.py index 9a9222d1..4977d7da 100644 --- a/src/ns_search/newslash_index/exceptions.py +++ b/src/ns_search/newslash_index/exceptions.py @@ -8,6 +8,7 @@ class NewslashIndexError(Exception): class ConfigFileError(NewslashIndexError): SECTION_NOT_FOUND = "Section not found" + PARAMETER_NOT_FOUND = "Parameter not found" def __init__(self, reason, section=""): self.message = "Config Error - {}: {}".format(reason, section) diff --git a/src/ns_search/newslash_index/index.py b/src/ns_search/newslash_index/index.py index 38e52410..2fbaa963 100644 --- a/src/ns_search/newslash_index/index.py +++ b/src/ns_search/newslash_index/index.py @@ -3,6 +3,7 @@ import calendar from datetime import datetime +import os.path import exceptions import lucene_wrapper @@ -24,9 +25,12 @@ class Index(object): self.config[k] = kwargs[k] self.metadata = Metadata(self.config) + p = self.config.get("index_path") + self._index_path = os.path.join(p, "lucene_index"); + def _get_searcher(self): - return lucene_wrapper.Searcher(index_directory=self.config.get("index_path")) + return lucene_wrapper.Searcher(index_directory=self._index_path) def _db_config(self): db_cfg = self.config.get("database") @@ -49,12 +53,12 @@ class Index(object): if indexer: indexer.delete(q) else: - with lucene_wrapper.Indexer(index_directory=self.config.get("index_path")) as indexer: + with 
lucene_wrapper.Indexer(index_directory=self._index_path) as indexer: indexer.delete(q) def delete_all(self): '''delete all document from index''' - with lucene_wrapper.Indexer(index_directory=self.config.get("index_path")) as indexer: + with lucene_wrapper.Indexer(index_directory=self._index_path) as indexer: indexer.delete_all() def _get_unique_id(self, target, item): @@ -88,7 +92,7 @@ class Index(object): max_unique_id = latest_id force_exit = False - with lucene_wrapper.Indexer(index_directory=self.config.get("index_path")) as indexer: + with lucene_wrapper.Indexer(index_directory=self._index_path) as indexer: try: while not query_done: items = selector(batch_size, 0, max_unique_id) @@ -148,7 +152,7 @@ class Index(object): update_success = 0 update_errors = 0 offset = 0 - with lucene_wrapper.Indexer(index_directory=self.config.get("index_path")) as indexer: + with lucene_wrapper.Indexer(index_directory=self._index_path) as indexer: items = selector(batch_size, offset, latest_id, last_update) offset += len(items) if len(items) < batch_size: diff --git a/src/ns_search/newslash_index/metadata.py b/src/ns_search/newslash_index/metadata.py index 470bcd27..7153ebfc 100644 --- a/src/ns_search/newslash_index/metadata.py +++ b/src/ns_search/newslash_index/metadata.py @@ -2,57 +2,99 @@ """metadata: index metadata for newslash search system""" import newslash_db -from mysql.connector.errors import ProgrammingError +import sqlite3 +import os.path +#from mysql.connector.errors import ProgrammingError import exceptions +class SQLiteDB(object): + def __init__(self, path): + self._path = path + self._conn = None + + def connect(self): + self._conn = sqlite3.connect(self._path); + #self._conn.autocommit = True + return self + + def cursor(self): + if not self._conn: + self.connect() + cur = self._conn.cursor() + return cur + + def execute(self, query, **kwargs): + cur = self.cursor() + cur.execute(query, kwargs) + return cur + + def commit(self): + self._conn.commit() + return 
self + + def rollback(self): + self._conn.rollback() + return self + + def close(self): + if self._conn: + self.commit() + self._conn.close() + self._conn = None + return self + + class Metadata(object): def __init__(self, config): self._config = config + path = config.get("index_path") + if not path: + raise exceptions.ConfigFileError(exceptions.ConfigFileError.PARAMETER_NOT_FOUND, "index_path") + + self._db_path = os.path.join(path, "_ns_metadata.db"); - def _db_config(self): - db_cfg = self._config.get("database") - if db_cfg is None: - raise exceptions.ConfigFileError(exceptions.ConfigFileError.SECTION_NOT_FOUND, "Database") - return db_cfg - + def _get_db(self): + return SQLiteDB(self._db_path); + def get_current_timestamp(self): - db = newslash_db.NewslashDB(self._db_config()) - cur = db.execute('SELECT NOW()') - return cur.fetchone()[0] + db = self._get_db() + cur = db.execute("SELECT datetime('now')") + r = cur.fetchone()[0] + db.close() + return r def create_table(self): sql = ( "CREATE TABLE ns_search_metadata (" - " target_id tinyint(8) unsigned NOT NULL AUTO_INCREMENT," - " target_name varchar(32) NOT NULL UNIQUE," - " last_update timestamp NOT NULL DEFAULT CURRENT_TIMESTAMP," - " latest_id int(8) unsigned NOT NULL DEFAULT 0," - " PRIMARY KEY (target_id)" + " target_id INTEGER PRIMARY KEY AUTOINCREMENT," + " target_name TEXT NOT NULL UNIQUE," + " last_update TEXT NOT NULL DEFAULT CURRENT_TIMESTAMP," + " latest_id INTEGER NOT NULL DEFAULT 0" ")" ) - db= newslash_db.NewslashDB(self._db_config()) + db = self._get_db() try: db.execute(sql) - except ProgrammingError as e: + except sqlite3.OperationalError as e: db.close() raise exceptions.DatabaseError('table creation error: {}'.format(str(e))) db.close() def drop_table(self): - db = newslash_db.NewslashDB(self._db_config()) + db = self._get_db() db.execute('DROP TABLE ns_search_metadata') db.close() def delete_all(self): - db = newslash_db.NewslashDB(self._db_config()) + db = self._get_db() 
db.execute('DELETE FROM ns_search_metadata') db.close() def get(self, target): - sql = 'SELECT * from ns_search_metadata WHERE target_name = %(target)s' - db = newslash_db.NewslashDB(self._db_config()) + sql = 'SELECT * from ns_search_metadata WHERE target_name = :target' + db = self._get_db() cur = db.execute(sql, target=target) if cur.rowcount > 0: result = dict(zip(cur.column_names, cur.fetchone())) @@ -63,25 +105,56 @@ class Metadata(object): return result def update(self, target, last_update, latest_id): + # sqlite 3.24 or after, can use ON CONFLICT, but current package's version + # not supported... + # if latest_id > 0: + # sql = ( + # "INSERT INTO ns_search_metadata" + # " (target_name, last_update, latest_id)" + # " VALUES (:target, :last_update, :latest_id)" + # " ON CONFLICT (target_name) DO UPDATE SET" + # " last_update = :last_update," + # " latest_id = :latest_id" + # ) + # else: + # sql = ( + # "INSERT INTO ns_search_metadata" + # " (target_name, last_update, latest_id)" + # " VALUES (:target, :last_update, :latest_id)" + # " ON CONFLICT (target_name) DO UPDATE SET" + # " last_update = :last_update" + # ) if latest_id > 0: - sql = ( - "INSERT INTO ns_search_metadata" + update_sql = ( + "UPDATE OR IGNORE ns_search_metadata" + " SET last_update = :last_update," + " latest_id = :latest_id" + " WHERE target_name = :target" + ) + insert_sql = ( + "INSERT OR IGNORE INTO ns_search_metadata" " (target_name, last_update, latest_id)" - " VALUES (%(target)s, %(last_update)s, %(latest_id)s)" - " ON DUPLICATE KEY UPDATE" - " last_update = %(last_update)s," - " latest_id = %(latest_id)s" + " VALUES (:target, :last_update, :latest_id)" ) else: - sql = ( - "INSERT INTO ns_search_metadata" + update_sql = ( + "UPDATE OR IGNORE ns_search_metadata" + " SET last_update = :last_update" + " WHERE target_name = :target" + ) + insert_sql = ( + "INSERT OR IGNORE INTO ns_search_metadata" " (target_name, last_update, latest_id)" - " VALUES (%(target)s, %(last_update)s,
%(latest_id)s)" - " ON DUPLICATE KEY UPDATE" - " last_update = %(last_update)s" + " VALUES (:target, :last_update, :latest_id)" ) - db = newslash_db.NewslashDB(self._db_config()) - cur = db.execute(sql, target=target, last_update=last_update, latest_id=latest_id) + db = self._get_db() + try: + cur = db.execute(insert_sql, target=target, last_update=last_update, latest_id=latest_id) + cur = db.execute(update_sql, target=target, last_update=last_update, latest_id=latest_id) + except sqlite3.OperationalError as e: + db.close() + raise exceptions.DatabaseError('ns_search_metadata update error: {}'.format(str(e))) + db.close() diff --git a/src/ns_search/searchd.py b/src/ns_search/searchd.py index e071bc02..24172450 100644 --- a/src/ns_search/searchd.py +++ b/src/ns_search/searchd.py @@ -101,7 +101,8 @@ class Root(Route): #if req.environ['wsgi.errors']: #req.environ['wsgi.errors'].write(query.encode('utf-8')) - searcher = lucene_wrapper.Searcher(index_directory=self.config('Searchd', 'index_path')) + index_dir = os.path.join(self.config('Searchd', 'index_path'), 'lucene_index') + searcher = lucene_wrapper.Searcher(index_directory=index_dir) try: content_query = lucene_wrapper.Query("content_text", query_text) title_query = lucene_wrapper.Query("title", query_text) diff --git a/src/ns_search/searchd_cli.py b/src/ns_search/searchd_cli.py index b0e76dbd..f97bc6f4 100755 --- a/src/ns_search/searchd_cli.py +++ b/src/ns_search/searchd_cli.py @@ -19,8 +19,6 @@ import lucene_wrapper from newslash_index import Index, DatabaseError from newslash_db import NewslashDB -CONFIG_FILE="./.config.yml" - class SearchCLIError(Exception): def __init__(self, message): self.message = message @@ -96,7 +94,7 @@ class SearchCLI(object): try: fh = open(pathname) except IOError: - sys.stderr.write('config file ({}) not found...'.format(CONFIG_FILE)) + sys.stderr.write('config file ({}) not found...'.format(pathname)) self.config = {} return diff --git a/test-container/ns-searchd/Makefile 
b/test-container/ns-searchd/Makefile index d33e29ac..ebbec0c0 100644 --- a/test-container/ns-searchd/Makefile +++ b/test-container/ns-searchd/Makefile @@ -3,14 +3,24 @@ IMAGE_NAME=newslash-searchd CONTAINER_NAME=newslash-searchd PORT_OPTS=-p 6000:6000 -LINK_OPTS=--link srad-db:srad-test-db --link newslash-redis:newslash-redis -CERTS_OPTS=-v /etc/letsencrypt:/var/certs -ENV=-e "MOJO_LISTEN=https://*:3000?cert=/var/certs/live/sdtest.osdn.co.jp/cert.pem&key=/var/certs/live/sdtest.osdn.co.jp/privkey.pem" +LINK_OPTS=--link srad-db:srad-test-db \ + --link newslash-redis:newslash-redis \ + --link newslash-db:newslash-db +MOUNT_OPTS=-v /fioa1/newslash-searchd:/var/lucene_index +COMMAND=python /var/newslash/src/ns_search/searchd.py +HOST:=$(shell docker inspect $(CONTAINER_NAME) --format='{{range .NetworkSettings.Networks}}{{.IPAddress}}{{end}}' $(NAME) | uniq) image: Dockerfile files docker build $(DOCKER_OPTS) -f $< -t $(IMAGE_NAME) . +run-daemon: + docker run -d --name $(CONTAINER_NAME) $(PORT_OPTS) $(MOUNT_OPTS) $(LINK_OPTS) -v $(NEWSLASH_DIR):/var/newslash $(IMAGE_NAME) $(COMMAND) + +restart: + docker restart $(CONTAINER_NAME) run: - docker run -ti --rm -p 6000:6000 -v $(NEWSLASH_DIR):/var/newslash $(IMAGE_NAME) bash + docker run -ti --rm $(PORT_OPTS) $(MOUNT_OPTS) $(LINK_OPTS) -v $(NEWSLASH_DIR):/var/newslash $(IMAGE_NAME) bash +test-query: + curl http://$(HOST):6000/ -X POST -H 'Content-Type: application/json' -d '{"query":"firefox", "target":"story"}' -- 2.11.0