'''
参考借鉴了若干网站文章
在此一并致谢
'''
# Scrape every hyperlink (anchor text + target address) from a web page and
# write them to a text file, one link per line.
import io
import urllib

try:  # Python 3 module layout
    from html.parser import HTMLParser
    from urllib.parse import urljoin
    from urllib.request import urlopen
except ImportError:  # Python 2 module layout
    from HTMLParser import HTMLParser
    from urlparse import urljoin
    urlopen = urllib.urlopen

# Page that is scraped when the file is run as a script; also the default
# base used to resolve site-relative links.
url = "http://www.91tfboys.com"


class MyHTMLParser(HTMLParser):
    """Withdraw urls from html.

    After feeding markup, ``items`` holds one ``(text, link)`` tuple per
    ``<a href=...>`` element, in document order. ``text`` is the first
    non-blank text chunk found inside the anchor (stripped), or ``''`` when
    the anchor closes without any text (e.g. it only wraps an image).

    Args:
        base_url: Address used to resolve links that start with ``/``.
            Defaults to the module-level ``url``.
    """

    def __init__(self, base_url=None):
        # Explicit base-class call: on Python 2 HTMLParser is an old-style
        # class, so super() is not available there.
        HTMLParser.__init__(self)
        self.base_url = url if base_url is None else base_url
        self.links = ''   # target of the most recently seen <a href>
        self.text = ''    # text of the most recently recorded link
        self.items = []   # collected (text, link) tuples
        self.flag = 0     # 1 while an <a href> is still waiting for its text

    def handle_starttag(self, tag, attrs):
        """Remember the target of an ``<a href=...>`` tag."""
        if tag != 'a':
            return
        # dict() keeps the last value when an attribute is repeated, which
        # matches scanning the attribute list front to back.
        href = dict(attrs).get('href')
        if href is None:
            # No href at all, or a value-less ``<a href>`` (reported as None).
            return
        if href.startswith('/'):
            # urljoin also resolves protocol-relative ``//host/path`` links,
            # which plain string concatenation would mangle.
            href = urljoin(self.base_url, href)
        self.links = href
        self.flag = 1

    def handle_data(self, data):
        """Record the first non-blank text chunk inside a pending anchor."""
        if self.flag != 1:
            return
        label = data.strip()
        if not label:
            # Indentation/newlines before the real label; keep waiting.
            return
        self.text = label
        self.items.append((self.text, self.links))
        self.flag = 0

    def handle_endtag(self, tag):
        """Record a text-less anchor when it closes.

        Without this the pending link would be paired with the next
        unrelated piece of text that appears later in the document.
        """
        if tag == 'a' and self.flag == 1:
            self.text = ''
            self.items.append((self.text, self.links))
            self.flag = 0


def _fetch_page(address):
    """Download *address* and return its body decoded to text."""
    response = urlopen(address)
    try:
        raw = response.read()
        headers = response.info()
    finally:
        response.close()
    # The header object differs between Python 3 and Python 2.
    if hasattr(headers, 'get_content_charset'):
        charset = headers.get_content_charset()
    else:
        charset = headers.getparam('charset')
    try:
        return raw.decode(charset or 'utf-8', 'replace')
    except LookupError:
        # The server declared a charset name Python does not know.
        return raw.decode('utf-8', 'replace')


def _write_items(items, path):
    """Write each ``(text, link)`` pair to *path* as ``text + link + CRLF``."""
    # newline='' stops the text layer from translating the explicit '\r\n'.
    with io.open(path, 'w', encoding='utf-8', newline='') as out:
        for text, link in items:
            out.write(text + link + u'\r\n')


def main():
    """Scrape ``url`` and dump the collected links into ``17.txt``."""
    parser = MyHTMLParser()
    parser.feed(_fetch_page(url))
    parser.close()
    _write_items(parser.items, '17.txt')


if __name__ == '__main__':
    main()
'''
惭愧惭愧
弄到现在
'''