import newslash_db
import lucene_wrapper
+from htmlutil import strip_html_tag
from yaml import load
try:
continue
timestamp = calendar.timegm(item["time"].utctimetuple())
- content_text = item["introtext"] + item["bodytext"]
+ (content_text, urls) = strip_html_tag(item["introtext"] + item["bodytext"])
doc.add_string_field("type", "story")
doc.add_string_field("id", item["sid"])
+
doc.add_text_field("title", item["title"])
doc.add_text_field("content_text", content_text)
doc.add_text_field("dept", item["dept"])
+
doc.add_int_field("create_time", timestamp)
doc.add_int_field("topic", item["tid"])
doc.add_int_field("author", item["uid"])
doc.add_int_field("submitter", item["submitter"])
+ for url in urls:
+ doc.add_string_field("url", url)
+
print("index {}...".format(item["sid"]))
indexer.add(doc)
print("indexing done. total time: {}s".format(time.time() - start_time))
--- /dev/null
+# -*- coding: utf-8 -*-
+
+from HTMLParser import HTMLParser
+from htmlentitydefs import name2codepoint
+
+class HTMLTagStripper(HTMLParser):
+    """Collect the text content and link targets of an HTML fragment.
+
+    Text and decoded character/entity references are appended, in document
+    order, to ``self.result``; the ``href`` value of every ``<a>`` start tag
+    is appended to ``self.urls``.  All other markup is dropped.
+    """
+
+    def __init__(self):
+        HTMLParser.__init__(self)
+        # Text fragments in document order; the caller joins them.
+        self.result = []
+        # href values of <a> start tags, in document order.
+        self.urls = []
+
+    def handle_data(self, data):
+        """Keep the plain text found between tags unchanged."""
+        self.result.append(data)
+
+    def handle_entityref(self, name):
+        """Decode a named reference such as ``&amp;``.
+
+        A name that is missing from ``name2codepoint`` is kept as literal
+        ``&name;`` text instead of raising ``KeyError``, so one unknown
+        entity cannot abort the whole parse.
+        """
+        codepoint = name2codepoint.get(name)
+        if codepoint is None:
+            self.result.append('&' + name + ';')
+        else:
+            self.result.append(unichr(codepoint))
+
+    def handle_starttag(self, tag, attrs):
+        """Record the ``href`` of ``<a>`` tags; ignore every other tag."""
+        if tag != 'a':
+            return
+        for (attr, val) in attrs:
+            # A valueless attribute (``<a href>``) is reported with a value
+            # of None; there is no URL to record in that case.
+            if attr == 'href' and val:
+                self.urls.append(val)
+
+    def handle_charref(self, name):
+        """Decode a numeric reference such as ``&#65;`` or ``&#x41;``.
+
+        The hexadecimal prefix is accepted in either case (``x`` or ``X``).
+        A reference that cannot be converted to a character (malformed
+        digits or a code point ``unichr`` rejects) is kept as literal
+        ``&#name;`` text instead of raising.
+        """
+        try:
+            if name[:1] in ('x', 'X'):
+                codepoint = int(name[1:], 16)
+            else:
+                codepoint = int(name)
+            c = unichr(codepoint)
+        except (ValueError, OverflowError):
+            c = '&#' + name + ';'
+        self.result.append(c)
+
+
+def strip_html_tag(html):
+    """Return ``(text, urls)`` extracted from the markup in *html*.
+
+    *text* is the concatenation of everything the HTMLTagStripper parser
+    collected in ``result``; *urls* is its ``urls`` list.
+    """
+    # NOTE(review): the parser is fed once and never closed; input the
+    # parser is still buffering when feed() returns would not appear in
+    # the text -- confirm whether a close() call is wanted here.
+    stripper = HTMLTagStripper()
+    stripper.feed(html)
+    text = "".join(stripper.result)
+    return (text, stripper.urls)
+