]>
gitweb.pimeys.fr Git - dtc.git/blob - html2text.py
2 """html2text: Turn HTML into equivalent Markdown-structured text."""
4 __author__
= "Aaron Swartz (me@aaronsw.com)"
5 __copyright__
= "(C) 2004-2008 Aaron Swartz. GNU GPL 3."
6 __contributors__
= ["Martin 'Joey' Schulze", "Ricardo Reyes", "Kevin Jay North"]
9 # Support decoded entities with unifiable.
14 setattr(__builtins__
, 'True', 1)
15 setattr(__builtins__
, 'False', 0)
18 if hasattr(x
, 'has_key'): return x
.has_key(y
)
25 except ImportError: #Python3
26 import html
.entities
as htmlentitydefs
27 import urllib
.parse
as urlparse
28 import html
.parser
as HTMLParser
30 import urllib
.request
as urllib
33 import optparse
, re
, sys
, codecs
, types
35 try: from textwrap
import wrap
38 # Use Unicode characters instead of their ASCII pseudo-replacements
41 # Put the links after each paragraph instead of at the end.
42 LINKS_EACH_PARAGRAPH
= 0
44 # Wrap long lines at position. 0 for no wrapping. (Requires Python 2.3.)
47 # Don't show internal links (href="#local-anchor") -- corresponding link targets
48 # won't be visible in the plain text file anyway.
49 SKIP_INTERNAL_LINKS
= True
51 # Use inline, rather than reference, formatting for images and links
54 # Number of pixels Google indents nested lists
55 GOOGLE_LIST_INDENT
= 36
57 IGNORE_ANCHORS
= False
60 ### Entity Nonsense ###
63 if k
== 'apos': return ord("'")
64 if hasattr(htmlentitydefs
, "name2codepoint"): # requires Python 2.3
65 return htmlentitydefs
.name2codepoint
[k
]
67 k
= htmlentitydefs
.entitydefs
[k
]
68 if k
.startswith("&#") and k
.endswith(";"): return int(k
[2:-1]) # not in latin-1
69 return ord(codecs
.latin_1_decode(k
)[0])
71 unifiable
= {'rsquo':"'", 'lsquo':"'", 'rdquo':'"', 'ldquo':'"',
72 'copy':'(C)', 'mdash':'--', 'nbsp':' ', 'rarr':'->', 'larr':'<-', 'middot':'*',
73 'ndash':'-', 'oelig':'oe', 'aelig':'ae',
74 'agrave':'a', 'aacute':'a', 'acirc':'a', 'atilde':'a', 'auml':'a', 'aring':'a',
75 'egrave':'e', 'eacute':'e', 'ecirc':'e', 'euml':'e',
76 'igrave':'i', 'iacute':'i', 'icirc':'i', 'iuml':'i',
77 'ograve':'o', 'oacute':'o', 'ocirc':'o', 'otilde':'o', 'ouml':'o',
78 'ugrave':'u', 'uacute':'u', 'ucirc':'u', 'uuml':'u',
83 for k
in unifiable
.keys():
84 unifiable_n
[name2cp(k
)] = unifiable
[k
]
87 if name
[0] in ['x','X']:
92 if not UNICODE_SNOB
and c
in unifiable_n
.keys():
97 except NameError: #Python3
101 if not UNICODE_SNOB
and c
in unifiable
.keys():
105 except KeyError: return "&" + c
+ ';'
108 return unichr(name2cp(c
))
109 except NameError: #Python3
110 return chr(name2cp(c
))
112 def replaceEntities(s
):
115 return charref(s
[1:])
116 else: return entityref(s
)
118 r_unescape
= re
.compile(r
"&(#?[xX]?(?:[0-9a-fA-F]+|\w{1,8}));")
120 return r_unescape
.sub(replaceEntities
, s
)
122 ### End Entity Nonsense ###
125 """Return true if the line does only consist of whitespace characters."""
127 if c
is not ' ' and c
is not ' ':
132 """Wrap all paragraphs in the provided text."""
136 assert wrap
, "Requires Python 2.3."
139 for para
in text
.split("\n"):
141 if para
[0] != ' ' and para
[0] != '-' and para
[0] != '*':
142 for line
in wrap(para
, BODY_WIDTH
):
143 result
+= line
+ "\n"
147 if not onlywhite(para
):
148 result
+= para
+ "\n"
157 if tag
[0] == 'h' and len(tag
) == 2:
160 if n
in range(1, 10): return n
161 except ValueError: return 0
def dumb_property_dict(style):
    """Return a dict of CSS properties parsed from a declaration string.

    style -- text such as "color: red; font-weight: bold".

    The string is split on ';'. Pieces without a ':' are ignored; every
    other piece is split at its first ':' into a name and a value, both
    stripped of surrounding whitespace. When a name occurs more than once
    the last value wins.
    """
    properties = {}
    for declaration in style.split(';'):
        if ':' not in declaration:
            continue
        # Split only once so values that contain ':' (e.g. URLs) stay intact.
        name, value = declaration.split(':', 1)
        properties[name.strip()] = value.strip()
    return properties
167 def dumb_css_parser(data
):
168 """returns a hash of css selectors, each of which contains a hash of css attributes"""
169 # remove @import sentences
170 importIndex
= data
.find('@import')
171 while importIndex
!= -1:
172 data
= data
[0:importIndex
] + data
[data
.find(';', importIndex
) + 1:]
173 importIndex
= data
.find('@import')
175 # parse the css. reverted from dictionary comprehension in order to support older pythons
176 elements
= [x
.split('{') for x
in data
.split('}') if '{' in x
.strip()]
177 elements
= dict([(a
.strip(), dumb_property_dict(b
)) for a
, b
in elements
])
181 def element_style(attrs
, style_def
, parent_style
):
182 """returns a hash of the 'final' style attributes of the element"""
183 style
= parent_style
.copy()
185 for css_class
in attrs
['class'].split():
186 css_style
= style_def
['.' + css_class
]
187 style
.update(css_style
)
189 immediate_style
= dumb_property_dict(attrs
['style'])
190 style
.update(immediate_style
)
193 def google_list_style(style
):
194 """finds out whether this is an ordered or unordered list"""
195 if 'list-style-type' in style
:
196 list_style
= style
['list-style-type']
197 if list_style
in ['disc', 'circle', 'square', 'none']:
def google_nest_count(style, indent=None):
    """Calculate the nesting count of a Google Docs list element.

    style  -- dict of CSS properties for the element.
    indent -- number of pixels per nesting level. When omitted, the
              module-level GOOGLE_LIST_INDENT is read at call time, so a
              value assigned from the command-line options is honoured.

    Returns an int: 0 when the style has no 'margin-left', otherwise the
    margin divided by the indent, rounded down.
    """
    if 'margin-left' not in style:
        return 0
    if indent is None:
        indent = GOOGLE_LIST_INDENT
    # [:-2] drops the last two characters of the value (a unit suffix such
    # as "px"). Floor division keeps the result an int on Python 3 as well;
    # callers multiply a string by it, which fails for a float.
    return int(style['margin-left'][:-2]) // indent
208 def google_has_height(style
):
209 """check if the style of the element has the 'height' attribute explicitly defined"""
210 if 'height' in style
:
214 def google_text_emphasis(style
):
215 """return a list of all emphasis modifiers of the element"""
217 if 'text-decoration' in style
:
218 emphasis
.append(style
['text-decoration'])
219 if 'font-style' in style
:
220 emphasis
.append(style
['font-style'])
221 if 'font-weight' in style
:
222 emphasis
.append(style
['font-weight'])
225 def google_fixed_width_font(style
):
226 """check if the css of the current element defines a fixed width font"""
228 if 'font-family' in style
:
229 font_family
= style
['font-family']
230 if 'Courier New' == font_family
or 'Consolas' == font_family
:
234 def list_numbering_start(attrs
):
235 """extract numbering from list element attributes"""
237 return int(attrs
['start']) - 1
241 class _html2text(HTMLParser
.HTMLParser
):
242 def __init__(self
, out
=None, baseurl
=''):
243 HTMLParser
.HTMLParser
.__init
__(self
)
245 if out
is None: self
.out
= self
.outtextf
247 self
.outtextlist
= [] # empty list to store output characters before they are "joined"
249 self
.outtext
= unicode()
250 except NameError: # Python3
253 self
.p_p
= 0 # number of newline character to print before next output
267 self
.lastWasList
= False
272 self
.drop_white_space
= 0
273 self
.inheader
= False
274 self
.abbr_title
= None # current abbreviation definition
275 self
.abbr_data
= None # last inner HTML (for abbr being defined)
276 self
.abbr_list
= {} # stack of abbreviations to write later
277 self
.baseurl
= baseurl
279 if options
.google_doc
:
280 del unifiable_n
[name2cp('nbsp')]
281 unifiable
['nbsp'] = ' _place_holder;'
def feed(self, data):
    """Pre-process data, then hand it to HTMLParser.HTMLParser.feed.

    Every occurrence of the literal text </' + 'script> is replaced with
    </ignore> before parsing.
    """
    # Presumably targets inline JavaScript that assembles a closing script
    # tag by string concatenation -- TODO confirm against real inputs.
    data = data.replace("</' + 'script>", "</ignore>")
    HTMLParser.HTMLParser.feed(self, data)
def outtextf(self, s):
    """Append s to self.outtextlist and track whether it ended a line.

    For a non-empty s, self.lastWasNL is set to True when s ends with a
    newline and to False otherwise. An empty s leaves self.lastWasNL
    unchanged.
    """
    self.outtextlist.append(s)
    if s:
        self.lastWasNL = s[-1] == '\n'
292 HTMLParser
.HTMLParser
.close(self
)
297 self
.outtext
= self
.outtext
.join(self
.outtextlist
)
299 if options
.google_doc
:
300 self
.outtext
= self
.outtext
.replace(' _place_holder;', ' ');
def handle_charref(self, c):
    """Emit the text for the character reference c.

    c is passed to charref() and the result is written through self.o
    with the second argument (puredata) set to 1.
    """
    self.o(charref(c), 1)
def handle_entityref(self, c):
    """Emit the text for the entity reference c.

    c is passed to entityref() and the result is written through self.o
    with the second argument (puredata) set to 1.
    """
    self.o(entityref(c), 1)
def handle_starttag(self, tag, attrs):
    """Forward an opening tag to self.handle_tag with the start flag set to 1."""
    self.handle_tag(tag, attrs, 1)
def handle_endtag(self, tag):
    """Forward a closing tag to self.handle_tag.

    Closing tags carry no attributes, so None is passed for attrs and the
    start flag is 0.
    """
    self.handle_tag(tag, None, 0)
316 def previousIndex(self
, attrs
):
317 """ returns the index of certain set of attributes (of a link) in the
320 If the set of attributes is not found, returns None
322 if not has_key(attrs
, 'href'): return None
329 if has_key(a
, 'href') and a
['href'] == attrs
['href']:
330 if has_key(a
, 'title') or has_key(attrs
, 'title'):
331 if (has_key(a
, 'title') and has_key(attrs
, 'title') and
332 a
['title'] == attrs
['title']):
339 def drop_last(self
, nLetters
):
341 self
.outtext
= self
.outtext
[:-nLetters
]
343 def handle_emphasis(self
, start
, tag_style
, parent_style
):
344 """handles various text emphases"""
345 tag_emphasis
= google_text_emphasis(tag_style
)
346 parent_emphasis
= google_text_emphasis(parent_style
)
348 # handle Google's text emphasis
349 strikethrough
= 'line-through' in tag_emphasis
and options
.hide_strikethrough
350 bold
= 'bold' in tag_emphasis
and not 'bold' in parent_emphasis
351 italic
= 'italic' in tag_emphasis
and not 'italic' in parent_emphasis
352 fixed
= google_fixed_width_font(tag_style
) and not \
353 google_fixed_width_font(parent_style
) and not self
.pre
356 # crossed-out text must be handled before other attributes
357 # in order not to output qualifiers unnecessarily
358 if bold
or italic
or fixed
:
364 self
.drop_white_space
+= 1
367 self
.drop_white_space
+= 1
370 self
.drop_white_space
+= 1
373 if bold
or italic
or fixed
:
374 # there must not be whitespace before closing emphasis mark
377 self
.outtext
= self
.outtext
.rstrip()
379 if self
.drop_white_space
:
380 # empty emphasis, drop it
382 self
.drop_white_space
-= 1
387 if self
.drop_white_space
:
388 # empty emphasis, drop it
390 self
.drop_white_space
-= 1
394 if self
.drop_white_space
:
395 # empty emphasis, drop it
397 self
.drop_white_space
-= 1
400 # space is only allowed after *all* emphasis marks
401 if (bold
or italic
) and not self
.emphasis
:
406 def handle_tag(self
, tag
, attrs
, start
):
407 #attrs = fixattrs(attrs)
413 if options
.google_doc
:
414 # the attrs parameter is empty for a closing tag. in addition, we
415 # need the attributes of the parent nodes in order to get a
416 # complete style description for the current element. we assume
417 # that google docs export well formed html.
421 parent_style
= self
.tag_stack
[-1][2]
422 tag_style
= element_style(attrs
, self
.style_def
, parent_style
)
423 self
.tag_stack
.append((tag
, attrs
, tag_style
))
425 dummy
, attrs
, tag_style
= self
.tag_stack
.pop()
427 parent_style
= self
.tag_stack
[-1][2]
433 self
.o(hn(tag
)*"#" + ' ')
435 self
.inheader
= False
436 return # prevent redundant emphasis marks on headers
438 if tag
in ['p', 'div']:
439 if options
.google_doc
:
440 if start
and google_has_height(tag_style
):
447 if tag
== "br" and start
: self
.o(" \n")
449 if tag
== "hr" and start
:
454 if tag
in ["head", "style", 'script']:
455 if start
: self
.quiet
+= 1
456 else: self
.quiet
-= 1
459 if start
: self
.style
+= 1
460 else: self
.style
-= 1
463 self
.quiet
= 0 # sites like 9rules.com never close <head>
465 if tag
== "blockquote":
467 self
.p(); self
.o('> ', 0, 1); self
.start
= 1
473 if tag
in ['em', 'i', 'u']: self
.o("_")
474 if tag
in ['strong', 'b']: self
.o("**")
475 if tag
in ['del', 'strike']:
481 if options
.google_doc
:
482 if not self
.inheader
:
483 # handle some font attributes, but leave headers clean
484 self
.handle_emphasis(start
, tag_style
, parent_style
)
486 if tag
== "code" and not self
.pre
: self
.o('`') #TODO: `` `this` ``
489 self
.abbr_title
= None
491 if has_key(attrs
, 'title'):
492 self
.abbr_title
= attrs
['title']
494 if self
.abbr_title
!= None:
495 self
.abbr_list
[self
.abbr_data
] = self
.abbr_title
496 self
.abbr_title
= None
499 if tag
== "a" and not IGNORE_ANCHORS
:
501 if has_key(attrs
, 'href') and not (SKIP_INTERNAL_LINKS
and attrs
['href'].startswith('#')):
502 self
.astack
.append(attrs
)
505 self
.astack
.append(None)
508 a
= self
.astack
.pop()
511 self
.o("](" + a
['href'] + ")")
513 i
= self
.previousIndex(a
)
518 a
['count'] = self
.acount
519 a
['outcount'] = self
.outcount
521 self
.o("][" + str(a
['count']) + "]")
523 if tag
== "img" and start
and not IGNORE_IMAGES
:
524 if has_key(attrs
, 'src'):
525 attrs
['href'] = attrs
['src']
526 alt
= attrs
.get('alt', '')
530 self
.o("]("+ attrs
['href'] +")")
532 i
= self
.previousIndex(attrs
)
537 attrs
['count'] = self
.acount
538 attrs
['outcount'] = self
.outcount
542 self
.o("]["+ str(attrs
['count']) +"]")
544 if tag
== 'dl' and start
: self
.p()
545 if tag
== 'dt' and not start
: self
.pbr()
546 if tag
== 'dd' and start
: self
.o(' ')
547 if tag
== 'dd' and not start
: self
.pbr()
549 if tag
in ["ol", "ul"]:
550 # Google Docs create sub lists as top level lists
551 if (not self
.list) and (not self
.lastWasList
):
554 if options
.google_doc
:
555 list_style
= google_list_style(tag_style
)
558 numbering_start
= list_numbering_start(attrs
)
559 self
.list.append({'name':list_style
, 'num':numbering_start
})
561 if self
.list: self
.list.pop()
562 self
.lastWasList
= True
564 self
.lastWasList
= False
569 if self
.list: li
= self
.list[-1]
570 else: li
= {'name':'ul', 'num':0}
571 if options
.google_doc
:
572 nest_count
= google_nest_count(tag_style
)
574 nest_count
= len(self
.list)
575 self
.o(" " * nest_count
) #TODO: line up <ol><li>s > 9 correctly.
576 if li
['name'] == "ul": self
.o(options
.ul_item_mark
+ " ")
577 elif li
['name'] == "ol":
579 self
.o(str(li
['num'])+". ")
582 if tag
in ["table", "tr"] and start
: self
.p()
583 if tag
== 'td': self
.pbr()
594 if self
.p_p
== 0: self
.p_p
= 1
def p(self):
    """Request a paragraph break by setting self.p_p to 2.

    self.p_p is the number of newline characters to print before the next
    output.
    """
    self.p_p = 2
602 def o(self
, data
, puredata
=0, force
=0):
603 if self
.abbr_data
is not None: self
.abbr_data
+= data
606 if options
.google_doc
:
607 # prevent white space immediately after 'begin emphasis' marks ('**' and '_')
608 lstripped_data
= data
.lstrip()
609 if self
.drop_white_space
and not (self
.pre
or self
.code
):
610 data
= lstripped_data
611 if lstripped_data
!= '':
612 self
.drop_white_space
= 0
614 if puredata
and not self
.pre
:
615 data
= re
.sub('\s+', ' ', data
)
616 if data
and data
[0] == ' ':
619 if not data
and not force
: return
622 #self.out(" :") #TODO: not output when already one there
625 bq
= (">" * self
.blockquote
)
626 if not (force
and data
and data
[0] == ">") and self
.blockquote
: bq
+= " "
630 data
= data
.replace("\n", "\n"+bq
)
644 self
.out((self
.br_toggle
+'\n'+bq
)*self
.p_p
)
649 if not self
.lastWasNL
: self
.out(' ')
652 if self
.a
and ((self
.p_p
== 2 and LINKS_EACH_PARAGRAPH
) or force
== "end"):
653 if force
== "end": self
.out("\n")
657 if self
.outcount
> link
['outcount']:
658 self
.out(" ["+ str(link
['count']) +"]: " + urlparse
.urljoin(self
.baseurl
, link
['href']))
659 if has_key(link
, 'title'): self
.out(" ("+link
['title']+")")
664 if self
.a
!= newa
: self
.out("\n") # Don't need an extra line when nothing was done.
668 if self
.abbr_list
and force
== "end":
669 for abbr
, definition
in self
.abbr_list
.items():
670 self
.out(" *[" + abbr
+ "]: " + definition
+ "\n")
676 def handle_data(self
, data
):
677 if r
'\/script>' in data
: self
.quiet
-= 1
680 self
.style_def
.update(dumb_css_parser(data
))
def unknown_decl(self, data):
    """Ignore the declaration in data; nothing is emitted for it."""
    # NOTE(review): presumably overrides the HTMLParser hook of the same
    # name so unrecognised declarations are skipped silently -- confirm.
    pass
687 text
= text
.encode('utf-8')
689 sys
.stdout
.buffer.write(text
)
690 except AttributeError:
691 sys
.stdout
.write(text
)
693 def html2text_file(html
, out
=wrapwrite
, baseurl
=''):
694 h
= _html2text(out
, baseurl
)
def html2text(html, baseurl=''):
    """Convert html and return the result of optwrap() on the converted text.

    html    -- the HTML source to convert.
    baseurl -- forwarded unchanged to html2text_file.

    html2text_file is called with out=None, and whatever it returns is
    passed through optwrap.
    """
    converted = html2text_file(html, None, baseurl)
    return optwrap(converted)
704 options
.google_doc
= False
705 options
.ul_item_mark
= '*'
707 if __name__
== "__main__":
710 p
= optparse
.OptionParser('%prog [(filename|url) [encoding]]',
711 version
='%prog ' + __version__
)
712 p
.add_option("-g", "--google-doc", action
="store_true", dest
="google_doc",
713 default
=False, help="convert an html-exported Google Document")
714 p
.add_option("-d", "--dash-unordered-list", action
="store_true", dest
="ul_style_dash",
715 default
=False, help="use a dash rather than a star for unordered list items")
716 p
.add_option("-b", "--body-width", dest
="body_width", action
="store", type="int",
717 default
=78, help="number of characters per output line, 0 for no wrap")
718 p
.add_option("-i", "--google-list-indent", dest
="list_indent", action
="store", type="int",
719 default
=GOOGLE_LIST_INDENT
, help="number of pixels Google indents nested lists")
720 p
.add_option("-s", "--hide-strikethrough", action
="store_true", dest
="hide_strikethrough",
721 default
=False, help="hide strike-through text. only relevent when -g is specified as well")
722 (options
, args
) = p
.parse_args()
725 if options
.ul_style_dash
:
726 options
.ul_item_mark
= '-'
728 options
.ul_item_mark
= '*'
730 BODY_WIDTH
= options
.body_width
731 GOOGLE_LIST_INDENT
= options
.list_indent
740 p
.error('Too many arguments')
742 if file_
.startswith('http://') or file_
.startswith('https://'):
744 j
= urllib
.urlopen(baseurl
)
748 from feedparser
import _getCharacterEncoding
as enc
750 enc
= lambda x
, y
: ('utf-8', 1)
751 encoding
= enc(j
.headers
, text
)[0]
752 if encoding
== 'us-ascii':
754 data
= text
.decode(encoding
)
757 data
= open(file_
, 'rb').read()
760 from chardet
import detect
762 detect
= lambda x
: {'encoding': 'utf-8'}
763 encoding
= detect(data
)['encoding']
764 data
= data
.decode(encoding
)
766 data
= sys
.stdin
.read()
767 wrapwrite(html2text(data
, baseurl
))