其实如果 Blogbus 能提供一个完整的导出工具,我就不需要自己花一个小时去写和测试这段代码。导出的文件需要手工补上文件头和文件尾才是完整的 RSS 文件,然后用 WordPress 的 RSS Importer 导入即可。
另外,如果正文中带有链接,而链接 URL 中又含有非 ASCII 字符,HTMLParser 解析也会失败。
export.py内容如下:
#!/usr/bin/env python
# coding=utf-8
# Author: polo@live.cn
import httplib
import re
from HTMLParser import HTMLParser
class MLStripper(HTMLParser):
    """HTML-to-text filter: collects character data and drops all tags.

    Feed markup via ``feed()``; ``get_data()`` returns the accumulated
    text with fragments separated by blank lines.
    """

    def __init__(self):
        # HTMLParser is an old-style class in Python 2, so invoke its
        # __init__ directly (it simply calls reset(), which is what the
        # original code did by hand).
        HTMLParser.__init__(self)
        self.fed = []  # text fragments gathered between tags

    def handle_data(self, d):
        # Plain character data found outside of tags.
        self.fed.append(d)

    def handle_entityref(self, name):
        # BUGFIX: named entities (e.g. &amp;) were silently dropped,
        # losing text; keep them verbatim in the output.
        self.fed.append('&%s;' % name)

    def handle_charref(self, name):
        # BUGFIX: numeric character references (e.g. &#160;) were
        # silently dropped as well; keep them verbatim.
        self.fed.append('&#%s;' % name)

    def get_data(self):
        """Return all collected text joined by blank lines."""
        return '\n\n'.join(self.fed)
#strip html tags
def strip_tags(html):
    """Return the text content of *html* with all tags removed.

    Fragments are separated by blank lines (see MLStripper.get_data()).
    """
    s = MLStripper()
    s.feed(html)
    # BUGFIX: close() flushes any buffered, unterminated markup that
    # feed() alone would leave unprocessed.
    s.close()
    return s.get_data()
#fetch a blog's meta data, and append to xml file, and return the previous blog's url
def fetch_blog(surl, host='zhengrenchi.blogbus.com', outpath='blog.xml'):
    """Fetch one blog post page, append it to *outpath* as an RSS
    ``<item>``, and return the URL of the previous (older) post.

    Parameters
    ----------
    surl : str
        Site-relative path of the post, e.g. ``'/logs/228488258.html'``.
    host : str
        Blogbus host name (generalized from the old hard-coded value;
        the default preserves the original behavior).
    outpath : str
        File the ``<item>`` elements are appended to.

    Returns
    -------
    str
        Relative URL of the previous post, or ``''`` when none was
        found or the request did not return HTTP 200 — which is what
        terminates the crawl loop at the bottom of the script.
    """
    # Compile the patterns once up front instead of inside the loop.
    # The .*? forms rely on the page being flattened to one line below.
    header_re = re.compile(r'postHeader">.*?<h2>(.*?)</h2><h3>(.*?) \| Tag:(.*?)</h3>.*?</div>')
    tag_re = re.compile(r'<a href.*?>(.*?)</a>')
    # NOTE: the original compiled this exact pattern twice (r and r1)
    # and used r1 in an else-branch; since both were identical, that
    # branch was dead code and has been removed.
    body_re = re.compile(r'<div>.*?<p.*?</p>(.*?)<div>')
    prev_re = re.compile(r"<span><a href='(.*?)'>")

    conn = httplib.HTTPConnection(host)
    preurl = ''
    try:
        conn.request('GET', surl)
        rep = conn.getresponse()
        if rep.status == 200:
            # Flatten the page so the .*? patterns can match across
            # what used to be line breaks.
            content = rep.read().replace('\n', '').replace('\r', '')
            for title, pubdate, tags in header_re.findall(content):
                tagsa = tag_re.findall(tags)
                # BUGFIX: 'body' was unbound (NameError on write) when
                # the body pattern found nothing; default to ''.
                body = ''
                for chunk in body_re.findall(content):
                    body = chunk  # keep the last match, as before
                for link in prev_re.findall(content):
                    preurl = link  # "previous post" navigation link
                outfile = open(outpath, 'a+')
                try:
                    outfile.write('<item>')
                    outfile.write('<title>' + title + '</title>')
                    outfile.write('<pubdate>' + pubdate + '</pubdate>')
                    outfile.write('<description><![CDATA[' + strip_tags(body) + ']]></description>')
                    for tag in tagsa:
                        outfile.write('<category>' + tag + '</category>')
                    outfile.write('</item>')
                finally:
                    # BUGFIX: close the file even if a write fails.
                    outfile.close()
    finally:
        # BUGFIX: always release the connection; the original leaked it
        # whenever an exception escaped the request/parse code.
        conn.close()
    return preurl
#set the original url
xurl = '/logs/228488258.html'
while (xurl != ''):
print xurl
xurl = fetch_blog(xurl)