
ns_search: strip tags and add URL information when create_sample_index creates the index
author hylom <hylom@users.sourceforge.jp>
Mon, 26 Feb 2018 10:45:24 +0000 (19:45 +0900)
committer hylom <hylom@users.sourceforge.jp>
Mon, 26 Feb 2018 10:45:24 +0000 (19:45 +0900)
src/ns_search/create_sample_index.py
src/ns_search/htmlutil.py [new file with mode: 0644]

diff --git a/src/ns_search/create_sample_index.py b/src/ns_search/create_sample_index.py
index 0daa195..d3e124c 100644
--- a/src/ns_search/create_sample_index.py
+++ b/src/ns_search/create_sample_index.py
@@ -5,6 +5,7 @@ import calendar
 
 import newslash_db
 import lucene_wrapper
+from htmlutil import strip_html_tag
 
 from yaml import load
 try:
@@ -31,18 +32,23 @@ def main():
                 continue
 
             timestamp = calendar.timegm(item["time"].utctimetuple())
-            content_text = item["introtext"] + item["bodytext"]
+            (content_text, urls) = strip_html_tag(item["introtext"] + item["bodytext"])
 
             doc.add_string_field("type", "story")
             doc.add_string_field("id", item["sid"])
+
             doc.add_text_field("title", item["title"])
             doc.add_text_field("content_text", content_text)
             doc.add_text_field("dept", item["dept"])
+
             doc.add_int_field("create_time", timestamp)
             doc.add_int_field("topic", item["tid"])
             doc.add_int_field("author", item["uid"])
             doc.add_int_field("submitter", item["submitter"])
 
+            for url in urls:
+                doc.add_string_field("url", url)
+
             print("index {}...".format(item["sid"]))
             indexer.add(doc)
     print("indexing done. total time: {}s".format(time.time() - start_time))
diff --git a/src/ns_search/htmlutil.py b/src/ns_search/htmlutil.py
new file mode 100644
index 0000000..42acf23
--- /dev/null
+++ b/src/ns_search/htmlutil.py
@@ -0,0 +1,38 @@
+# -*- coding: utf-8 -*-
+
+from HTMLParser import HTMLParser
+from htmlentitydefs import name2codepoint
+
+class HTMLTagStripper(HTMLParser):
+    def __init__(self):
+        HTMLParser.__init__(self)
+        self.result = []
+        self.urls = []
+
+    def handle_data(self, data):
+        self.result.append(data)
+
+    def handle_entityref(self, name):
+        c = unichr(name2codepoint[name])
+        self.result.append(c)
+
+    def handle_starttag(self, tag, attrs):
+        if tag != 'a':
+            return
+        for (attr, val) in attrs:
+            if attr == 'href':
+                self.urls.append(val)
+
+    def handle_charref(self, name):
+        if name.startswith(('x', 'X')):
+            c = unichr(int(name[1:], 16))
+        else:
+            c = unichr(int(name))
+        self.result.append(c)
+
+
+def strip_html_tag(html):
+    p = HTMLTagStripper()
+    p.feed(html)
+    return ("".join(p.result), p.urls)
+
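
Besides collecting hrefs, the helper decodes named and numeric character references while stripping tags. A minimal usage sketch (Python 2, matching the HTMLParser import above; the sample HTML is illustrative only):

    from htmlutil import strip_html_tag

    html = '<p>See <a href="https://example.com/">the docs</a> &amp; more&#46;</p>'
    (text, urls) = strip_html_tag(html)
    print(text)   # u'See the docs & more.'
    print(urls)   # ['https://example.com/']
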