"""html2text: Turn HTML into equivalent Markdown-structured text.""" __version__ = "2.0, alpha" __author__ = "Aaron Swartz (me@aaronsw.com)" __copyright__ = "(C) 2004 Aaron Swartz. GNU GPL 2." # TODO: Word wrap. import re, sys, urllib, htmlentitydefs import sgmllib sgmllib.charref = re.compile('([xX]?[0-9a-fA-F]+)[^0-9a-fA-F]') # Use Unicode characters instead of their ascii psuedo-replacements UNICODE_SNOB = 0 # Put the links after each paragraph instead of at the end. LINKS_EACH_PARAGRAPH = 0 ### Entity Nonsense ### def name2cp(k): if k == 'apos': return ord("'") if hasattr(htmlentitydefs, "name2codepoint"): # requires Python 2.3 return htmlentitydefs.name2codepoint[k] else: k = htmlentitydefs.entitydefs[k] if k.startswith("") and k.endswith(";"): return int(k[2:-1]) # not in latin-1 return ord(k.decode('ISO-8859-1')) unifiable = {'rsquo':"'", 'lsquo':"'", 'rdquo':'"', 'ldquo':'"', 'copy':'(C)', 'mdash':'--', 'nbsp':' ', 'rarr':'->', 'larr':'<-', 'middot':'*', 'oelig':'oe', 'aelig':'ae', 'agrave':'a', 'aacute':'a', 'acirc':'a', 'atilde':'a', 'auml':'a', 'aring':'a', 'egrave':'e', 'eacute':'e', 'ecirc':'e', 'euml':'e', 'igrave':'i', 'iacute':'i', 'icirc':'i', 'iuml':'i', 'ograve':'o', 'oacute':'o', 'ocirc':'o', 'otilde':'o', 'ouml':'o', 'ugrave':'u', 'uacute':'u', 'ucirc':'u', 'uuml':'u'} unifiable_n = {} for k in unifiable.keys(): unifiable_n[name2cp(k)] = unifiable[k] def charref(name): if name[0] in ['x','X']: c = int(name[1:], 16) else: c = int(name) if not UNICODE_SNOB and c in unifiable_n.keys(): return unifiable_n[c] else: return unichr(c).encode('utf8') def entityref(c): if not UNICODE_SNOB and c in unifiable.keys(): return unifiable[c] else: try: name2cp(c) except KeyError: return "&" + c else: return unichr(name2cp(c)).encode('utf8') def replaceEntities(s): s = s.group(0) if s[0] == "#": return charref(s[1:]) else: return entityref(s) r_unescape = re.compile(r"&(#?[xX]?(?:[0-9a-fA-F]+|\w{1,8}));") def unescape(s): return r_unescape.sub(replaceEntities, s) def fixattrs(attrs): # Fix bug in sgmllib.py if not attrs: return attrs newattrs = [] for attr in attrs: newattrs.append((attr[0], unescape(attr[1]))) return newattrs ### End Entity Nonsense ### def out(text): sys.stdout.write(text) def hn(tag): if tag[0] == 'h' and len(tag) == 2: try: n = int(tag[1]) if n in range(1, 10): return n except ValueError: return False class _html2text(sgmllib.SGMLParser): def __init__(self): sgmllib.SGMLParser.__init__(self) self.quiet = 0 self.p_p = 0 self.outcount = 0 self.start = 1 self.space = 0 self.a = [] self.astack = [] self.acount = 0 self.list = [] self.blockquote = 0 self.pre = 0 self.startpre = 0 self.lastWasNL = 0 def close(self): sgmllib.SGMLParser.close(self) self.pbr() self.o('', 0, 'end') def handle_charref(self, c): self.o(charref(c)) def handle_entityref(self, c): self.o(entityref(c)) def unknown_starttag(self, tag, attrs): self.handle_tag(tag, attrs, 1) def unknown_endtag(self, tag): self.handle_tag(tag, None, 0) def handle_tag(self, tag, attrs, start): attrs = fixattrs(attrs) if hn(tag): self.p() if start: self.o(hn(tag)*"#" + ' ') if tag in ['p', 'div']: self.p() if tag == "br" and start: self.o(" \n") if tag in ["head", "style", 'script']: if start: self.quiet += 1 else: self.quiet -= 1 if tag == "blockquote": if start: self.p(); self.o('> ', 0, 1); self.start = 1 self.blockquote += 1 else: self.blockquote -= 1 self.p() if tag in ['em', 'i', 'u']: self.o("_") if tag in ['strong', 'b']: self.o("**") if tag == "code" and not self.pre: self.o('`') #TODO: `` `this` `` if tag == "a": if start: attrs = dict(attrs) if attrs.has_key('href'): self.astack.append(attrs) self.o("[") else: self.astack.append(None) else: if self.astack: a = self.astack.pop() if a: self.acount += 1 a['count'] = self.acount a['outcount'] = self.outcount self.o("][" + `a['count']` + "]") self.a.append(a) if tag == "img" and start: self.acount += 1 attrs = dict(attrs) if attrs.has_key('src'): attrs['href'] = attrs['src'] attrs['count'] = self.acount attrs['outcount'] = self.outcount self.a.append(attrs) self.o("![") if attrs.has_key('alt'): self.o(attrs['alt']) self.o("]["+`self.acount`+"]") if tag in ["ol", "ul"]: if start: self.list.append({'name':tag, 'num':0}) else: self.list.pop() self.p() if tag == 'li': if start: self.pbr() if self.list: li = self.list[-1] else: li = {'name':'ul', 'num':0} self.o(" "*len(self.list)) #TODO: line up