OSDN Git Service

ns_search: add Field related classes
author: hylom <hylom@users.sourceforge.jp>
Wed, 23 May 2018 10:15:11 +0000 (19:15 +0900)
committer: hylom <hylom@users.sourceforge.jp>
Wed, 23 May 2018 10:15:11 +0000 (19:15 +0900)
src/ns_search/lucene_wrapper/__init__.py
src/ns_search/lucene_wrapper/wrapper.py

index a51e517..6a89dcb 100644 (file)
@@ -8,7 +8,9 @@ from wrapper import (
     init_vm,
     Indexer,
     Searcher,
+    Reader,
     Document,
+    Field,
     Query,
     QueryParseError,
     TermQuery,
index afa4174..c8160da 100644 (file)
@@ -16,6 +16,8 @@ from org.apache.lucene.search import (IndexSearcher,
                                       Sort as _Sort,
                                       SortField,
 )
+from org.apache.lucene.analysis.tokenattributes import CharTermAttribute
+
 from org.apache.lucene.search.highlight import (SimpleHTMLFormatter,
                                                 Highlighter as _Highlighter,
                                                 QueryScorer,
@@ -51,6 +53,15 @@ class LuceneWrapper(object):
     def _get_index_directory(self):
         return SimpleFSDirectory(File(self._config["index_directory"]))
 
+    def _get_searcher(self):
+        self._searcher = IndexSearcher(self._get_reader())
+        return self._searcher
+
+    def _get_reader(self):
+        index_dir = self._get_index_directory()
+        self._reader = IndexReader.open(index_dir)
+        return self._reader
+
 class Indexer(LuceneWrapper):
     def __init__(self, **kwargs):
         super(Indexer, self).__init__(**kwargs)
@@ -127,17 +138,22 @@ class Document(LuceneWrapper):
         self.doc.removeFields(name)
 
 
+class Reader(LuceneWrapper):
+    def __init__(self, **kwargs):
+        super(Reader, self).__init__(**kwargs)
+        self._get_reader()
+
+    def get_term_vector(self, doc_id, field_name):
+        return self._reader.getTermVector(doc_id, field_name)
+
+    def leaves(self):
+        return self._reader.leaves()
+
 class Searcher(LuceneWrapper):
     def __init__(self, **kwargs):
         super(Searcher, self).__init__(**kwargs)
         self._get_searcher()
 
-    def _get_searcher(self):
-        index_dir = self._get_index_directory()
-        reader = IndexReader.open(index_dir)
-        self._searcher = IndexSearcher(reader)
-        return self
-
     def search(self, query, limit=1000, offset=0, sort=None):
         if sort:
             col = TopFieldCollector.create(sort.sort, limit + offset,
@@ -194,6 +210,26 @@ class ScoredDocument(LuceneWrapper):
         val = self._searcher_doc.get(name)
         return val
 
+    def get_fields(self):
+        return [Field(x) for x in self._searcher_doc.getFields()]
+
+
+class Field(LuceneWrapper):
+    def __init__(self, field):
+        super(Field, self).__init__()
+        self._field = field
+        self.name = field.name()
+        self.value = field.stringValue()
+
+    def get_tokens(self):
+        stream = self._field.tokenStream(self._get_analyzer(), None)
+        stream.reset()
+        result = []
+        while stream.incrementToken():
+            term = stream.getAttribute(CharTermAttribute.class_)
+            result.append(term.toString())
+        return result
+
 
 class BooleanQuery(LuceneWrapper):
     def __init__(self):