其实如果 BlogBus 能提供一个完整的导出工具,我就不需要自己花 1 个小时去写和测试这段代码。脚本生成的 blog.xml 只包含一个个 <item> 条目,需要手动在文件头尾补上 RSS 的外层标签(例如 <rss> 和 <channel>),才是完整的 RSS 文件;之后用 WordPress 的 RSS Importer 导入就可以了。
另外,如果正文中带有链接,而链接中又含有非 ASCII 字符,HTML parser 也会解析失败。
#!/usr/bin/env python
# coding=utf-8
# Author:polo@live.cn
"""Export the posts of a blogbus.com blog as RSS ``<item>`` fragments.

Starting from one post URL, the script follows the "previous post" link of
each page and appends one ``<item>`` element per post to ``blog.xml``.  Only
the items are written; the enclosing RSS header and footer are not.

The code runs unchanged on Python 2 and Python 3.
"""
import re
import sys

try:  # Python 2 module names
    import httplib
    from HTMLParser import HTMLParser
except ImportError:  # Python 3 module names
    import http.client as httplib
    from html.parser import HTMLParser

BLOG_HOST = 'zhengrenchi.blogbus.com'
START_URL = '/logs/228488258.html'
OUTPUT_FILE = 'blog.xml'

# Patterns are applied to a page whose line breaks have been removed, so '.'
# can span what used to be several lines.  Compiled once, not once per page.
_HEADER_RE = re.compile(
    r'postHeader">.*?<h2>(.*?)</h2><h3>(.*?) \| Tag:(.*?)</h3>.*?</div>')
_TAG_LINK_RE = re.compile(r'<a href.*?>(.*?)</a>')
# NOTE: the original script kept a second, byte-identical copy of this
# pattern as a fallback; it could never match when this one did not, so it
# was dropped.
_BODY_RE = re.compile(r'<div>.*?<p.*?</p>(.*?)<div>')
_PREV_LINK_RE = re.compile(r"<span><a href='(.*?)'>")
_CHARSET_RE = re.compile(r'charset=([\w.:-]+)', re.I)


class MLStripper(HTMLParser):
    """HTML parser that collects only the text found between tags."""

    def __init__(self):
        # HTMLParser is an old-style class on Python 2, so super() cannot be
        # used; the explicit call works on both major versions.
        HTMLParser.__init__(self)
        # Python 3 would otherwise decode character references and merge them
        # into the surrounding text.  Switching that off keeps the Python 2
        # behaviour: references are not passed to handle_data() and therefore
        # do not appear in the output.  The attribute is ignored on Python 2.
        self.convert_charrefs = False
        self.fed = []

    def handle_data(self, d):
        """Remember one run of text."""
        self.fed.append(d)

    def get_data(self):
        """Return all collected text runs separated by blank lines."""
        return '\n\n'.join(self.fed)


# strip html tags
def strip_tags(html):
    """Return the text content of *html* with every tag removed.

    Each run of text between two tags becomes its own paragraph, separated
    from the next one by a blank line.
    """
    s = MLStripper()
    s.feed(html)
    return s.get_data()


def _decode_page(raw, content_type):
    """Decode the response bytes *raw* to text.

    The charset named in the Content-Type header value *content_type* is
    used when present and known; otherwise UTF-8 is assumed.  Undecodable
    bytes are replaced rather than raising.
    """
    match = _CHARSET_RE.search(content_type or '')
    charset = match.group(1) if match else 'utf-8'
    try:
        return raw.decode(charset, 'replace')
    except LookupError:  # charset label unknown to Python
        return raw.decode('utf-8', 'replace')


def _extract_post(page):
    """Return the post found in *page* as a dict, or None without a header.

    When a pattern matches more than once the last match wins, as in the
    original loops.  A missing body yields an empty string.
    """
    headers = _HEADER_RE.findall(page)
    if not headers:
        return None
    title, pubdate, tag_html = headers[-1]
    bodies = _BODY_RE.findall(page)
    return {
        'title': title,
        'time': pubdate,
        'tags': _TAG_LINK_RE.findall(tag_html),
        'body': bodies[-1] if bodies else '',
    }


def _format_item(post):
    """Render the dict produced by _extract_post() as one RSS <item>."""
    parts = [
        '<item>',
        '<title>' + post['title'] + '</title>',
        '<pubdate>' + post['time'] + '</pubdate>',
        '<description><![CDATA[' + strip_tags(post['body'])
        + ']]></description>',
    ]
    parts.extend('<category>' + tag + '</category>' for tag in post['tags'])
    parts.append('</item>')
    return ''.join(parts)


# fetch a blog's meta data, and append to xml file, and return the previous
# blog's url
def fetch_blog(surl):
    """Fetch one post, append it to the output file, return the previous URL.

    Args:
        surl: path of the post on BLOG_HOST, e.g. '/logs/228488258.html'.

    Returns:
        The target of the last ``<span><a href='...'>`` link on the page,
        or '' when the response status is not 200 or no such link exists.

    A page answered with status 200 that contains no recognisable post
    header is reported on stderr and skipped instead of raising NameError.
    """
    conn = httplib.HTTPConnection(BLOG_HOST)
    try:
        conn.request('GET', surl)
        rep = conn.getresponse()
        if rep.status != 200:
            return ''
        raw = rep.read()
        content_type = rep.getheader('content-type', '')
    finally:
        # Release the socket even when the request or the read fails.
        conn.close()

    # Decode once at the network boundary and work on text from here on.
    # Feeding raw non-ASCII bytes to HTMLParser on Python 2 can raise
    # UnicodeDecodeError (e.g. for links containing non-ASCII characters).
    page = _decode_page(raw, content_type)
    page = page.replace('\n', '').replace('\r', '')

    post = _extract_post(page)
    if post is None:
        sys.stderr.write('no post header found in %s; skipped\n' % surl)
    else:
        # Binary append plus an explicit encode gives identical UTF-8 output
        # on Python 2 and 3; 'with' closes the file on errors too.
        with open(OUTPUT_FILE, 'ab') as outfile:
            outfile.write(_format_item(post).encode('utf-8'))

    links = _PREV_LINK_RE.findall(page)
    preurl = links[-1] if links else ''
    if not isinstance(preurl, str):
        # Python 2 only: hand the caller a native byte string again.
        preurl = preurl.encode('utf-8')
    return preurl


def main():
    """Walk backwards from START_URL until there is no previous post."""
    # set the original url
    xurl = START_URL
    visited = set()
    # The visited set stops the walk if the links ever lead back to a page
    # that was already exported, which would otherwise loop forever.
    while xurl != '' and xurl not in visited:
        visited.add(xurl)
        print(xurl)
        xurl = fetch_blog(xurl)


if __name__ == '__main__':
    main()