
ns_search: strip tags and add URL information when create_sample_index creates the index
author hylom <hylom@users.sourceforge.jp>
Mon, 26 Feb 2018 10:45:24 +0000 (19:45 +0900)
committer hylom <hylom@users.sourceforge.jp>
Mon, 26 Feb 2018 10:45:24 +0000 (19:45 +0900)
src/ns_search/create_sample_index.py
src/ns_search/htmlutil.py [new file with mode: 0644]

diff --git a/src/ns_search/create_sample_index.py b/src/ns_search/create_sample_index.py
index 0daa195..d3e124c 100644
--- a/src/ns_search/create_sample_index.py
+++ b/src/ns_search/create_sample_index.py
@@ -5,6 +5,7 @@ import calendar
 
 import newslash_db
 import lucene_wrapper
+from htmlutil import strip_html_tag
 
 from yaml import load
 try:
@@ -31,18 +32,23 @@ def main():
                 continue
 
             timestamp = calendar.timegm(item["time"].utctimetuple())
-            content_text = item["introtext"] + item["bodytext"]
+            (content_text, urls) = strip_html_tag(item["introtext"] + item["bodytext"])
 
             doc.add_string_field("type", "story")
             doc.add_string_field("id", item["sid"])
+
             doc.add_text_field("title", item["title"])
             doc.add_text_field("content_text", content_text)
             doc.add_text_field("dept", item["dept"])
+
             doc.add_int_field("create_time", timestamp)
             doc.add_int_field("topic", item["tid"])
             doc.add_int_field("author", item["uid"])
             doc.add_int_field("submitter", item["submitter"])
 
+            for url in urls:
+                doc.add_string_field("url", url)
+
             print("index {}...".format(item["sid"]))
             indexer.add(doc)
     print("indexing done. total time: {}s".format(time.time() - start_time))
diff --git a/src/ns_search/htmlutil.py b/src/ns_search/htmlutil.py
new file mode 100644
index 0000000..42acf23
--- /dev/null
+++ b/src/ns_search/htmlutil.py
@@ -0,0 +1,38 @@
+# -*- coding: utf-8 -*-
+
+from HTMLParser import HTMLParser
+from htmlentitydefs import name2codepoint
+
+class HTMLTagStripper(HTMLParser):
+    def __init__(self):
+        HTMLParser.__init__(self)
+        self.result = []
+        self.urls = []
+
+    def handle_data(self, data):
+        self.result.append(data)
+
+    def handle_entityref(self, name):
+        c = unichr(name2codepoint[name])
+        self.result.append(c)
+
+    def handle_starttag(self, tag, attrs):
+        if tag != 'a':
+            return
+        for (attr, val) in attrs:
+            if attr == 'href':
+                self.urls.append(val)
+
+    def handle_charref(self, name):
+        if name.startswith(('x', 'X')):
+            c = unichr(int(name[1:], 16))
+        else:
+            c = unichr(int(name))
+        self.result.append(c)
+
+
+def strip_html_tag(html):
+    p = HTMLTagStripper()
+    p.feed(html)
+    return ("".join(p.result), p.urls)
+
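
Besides collecting hrefs, the helper decodes named and numeric character references while stripping tags. A minimal usage sketch (Python 2, matching the HTMLParser import above; the sample HTML is illustrative only):

    from htmlutil import strip_html_tag

    html = '<p>See <a href="https://example.com/">the docs</a> &amp; more&#46;</p>'
    (text, urls) = strip_html_tag(html)
    print(text)   # u'See the docs & more.'
    print(urls)   # ['https://example.com/']
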