gitweb.pimeys.fr Git - dtc.git/blob - html2text.py
Affichage de l'erreur aussi + échec de %s
[dtc.git] / html2text.py
1 #!/usr/bin/env python
2 """html2text: Turn HTML into equivalent Markdown-structured text."""
3 __version__ = "3.1"
4 __author__ = "Aaron Swartz (me@aaronsw.com)"
5 __copyright__ = "(C) 2004-2008 Aaron Swartz. GNU GPL 3."
6 __contributors__ = ["Martin 'Joey' Schulze", "Ricardo Reyes", "Kevin Jay North"]
7
8 # TODO:
9 # Support decoded entities with unifiable.
10
# Compatibility shim for very old interpreters that lack the boolean
# builtins: define them as the integers 1 and 0.
try:
    True
except NameError:
    for _bool_name, _bool_value in (('True', 1), ('False', 0)):
        setattr(__builtins__, _bool_name, _bool_value)
16
def has_key(x, y):
    """Return whether container *x* holds key/member *y*.

    Uses the legacy ``has_key`` method when *x* provides one and falls back
    to the ``in`` operator otherwise.
    """
    if not hasattr(x, 'has_key'):
        return y in x
    return x.has_key(y)
20
21 try:
22 import htmlentitydefs
23 import urlparse
24 import HTMLParser
25 except ImportError: #Python3
26 import html.entities as htmlentitydefs
27 import urllib.parse as urlparse
28 import html.parser as HTMLParser
29 try: #Python3
30 import urllib.request as urllib
31 except:
32 import urllib
33 import optparse, re, sys, codecs, types
34
35 try: from textwrap import wrap
36 except: pass
37
38 # Use Unicode characters instead of their ascii pseudo-replacements
39 UNICODE_SNOB = 0
40
41 # Put the links after each paragraph instead of at the end.
42 LINKS_EACH_PARAGRAPH = 0
43
44 # Wrap long lines at position. 0 for no wrapping. (Requires Python 2.3.)
45 BODY_WIDTH = 78
46
47 # Don't show internal links (href="#local-anchor") -- corresponding link targets
48 # won't be visible in the plain text file anyway.
49 SKIP_INTERNAL_LINKS = True
50
51 # Use inline, rather than reference, formatting for images and links
52 INLINE_LINKS = True
53
54 # Number of pixels Google indents nested lists
55 GOOGLE_LIST_INDENT = 36
56
57 IGNORE_ANCHORS = False
58 IGNORE_IMAGES = False
59
60 ### Entity Nonsense ###
61
def name2cp(k):
    """Return the Unicode code point (an int) for the HTML entity name *k*.

    ``apos`` is handled explicitly. Other names are looked up in
    ``htmlentitydefs``; an unknown name raises ``KeyError``.
    """
    if k == 'apos':
        return ord("'")

    if hasattr(htmlentitydefs, "name2codepoint"): # requires Python 2.3
        return htmlentitydefs.name2codepoint[k]

    # Older interpreters only expose ``entitydefs``, whose values are either
    # a numeric reference ("&#NNN;") or a single latin-1 encoded character.
    definition = htmlentitydefs.entitydefs[k]
    is_numeric_ref = definition.startswith("&#") and definition.endswith(";")
    if is_numeric_ref:
        return int(definition[2:-1]) # not in latin-1
    decoded = codecs.latin_1_decode(definition)[0]
    return ord(decoded)
70
# Entity name -> plain-text replacement, used by charref() and entityref()
# when UNICODE_SNOB is off.
unifiable = {
    'rsquo':"'", 'lsquo':"'", 'rdquo':'"', 'ldquo':'"',
    'copy':'(C)', 'mdash':'--', 'nbsp':' ', 'rarr':'->', 'larr':'<-',
    'middot':'*',
    'ndash':'-', 'oelig':'oe', 'aelig':'ae',
    'agrave':'a', 'aacute':'a', 'acirc':'a', 'atilde':'a', 'auml':'a',
    'aring':'a',
    'egrave':'e', 'eacute':'e', 'ecirc':'e', 'euml':'e',
    'igrave':'i', 'iacute':'i', 'icirc':'i', 'iuml':'i',
    'ograve':'o', 'oacute':'o', 'ocirc':'o', 'otilde':'o', 'ouml':'o',
    'ugrave':'u', 'uacute':'u', 'ucirc':'u', 'uuml':'u',
    'lrm':'', 'rlm':'',
}

# The same replacements, keyed by numeric code point instead of by name.
unifiable_n = {}

for k in unifiable:
    unifiable_n[name2cp(k)] = unifiable[k]
85
def charref(name):
    """Resolve a numeric character reference body to text.

    *name* is the part between ``&#`` and ``;``: decimal digits, or ``x``/``X``
    followed by hexadecimal digits.

    Returns the replacement from ``unifiable_n`` when UNICODE_SNOB is off and
    one exists, otherwise the character itself. A reference that cannot be
    parsed, or whose code point cannot be turned into a character, is echoed
    back unchanged as ``&#name;`` (the same policy entityref() applies to
    unknown entity names) instead of raising.
    """
    try:
        if name[0] in ['x','X']:
            c = int(name[1:], 16)
        else:
            c = int(name)
    except (IndexError, ValueError):
        # empty body, bare "x", or non-numeric text such as "&#zz;"
        return '&#' + name + ';'

    if not UNICODE_SNOB and c in unifiable_n:
        return unifiable_n[c]

    try:
        to_char = unichr
    except NameError: #Python3
        to_char = chr
    try:
        return to_char(c)
    except (ValueError, OverflowError):
        # code point outside the range the interpreter can represent
        return '&#' + name + ';'
99
def entityref(c):
    """Resolve the named entity *c* (text between ``&`` and ``;``) to text.

    Returns the replacement from ``unifiable`` when UNICODE_SNOB is off and
    the name is listed there; otherwise the character for the entity's code
    point. A name that name2cp() does not know is returned untouched as
    ``&name;``.
    """
    if not UNICODE_SNOB and c in unifiable:
        return unifiable[c]

    try:
        codepoint = name2cp(c)
    except KeyError:
        return "&" + c + ';'

    try:
        return unichr(codepoint)
    except NameError: #Python3
        return chr(codepoint)
111
def replaceEntities(s):
    """Substitution callback for ``r_unescape``.

    *s* is a regex match whose first group is the reference body; bodies
    starting with ``#`` go to charref(), everything else to entityref().
    """
    body = s.group(1)
    if body[0] != "#":
        return entityref(body)
    return charref(body[1:])
117
# Matches "&name;", "&#123;" and "&#x7b;" style references; group 1 is the
# text between "&" and ";".
r_unescape = re.compile(r"&(#?[xX]?(?:[0-9a-fA-F]+|\w{1,8}));")

def unescape(s):
    """Return *s* with every entity/character reference replaced."""
    unescaped = r_unescape.sub(replaceEntities, s)
    return unescaped
121
122 ### End Entity Nonsense ###
123
def onlywhite(line):
    """Return True if *line* is non-empty and consists only of spaces.

    An empty line yields False, matching the truthiness of the previous
    implementation (which returned the line itself).
    """
    # Compare by value: ``is`` on string literals tests object identity and
    # only works by accident of interpreter caching.
    # NOTE(review): the original tested the same ' ' literal twice; the second
    # literal may have been a tab or a wider run of spaces before whitespace
    # was lost -- confirm against the upstream file.
    for c in line:
        if c != ' ':
            return False
    return bool(line)
130
def optwrap(text):
    """Wrap all paragraphs in the provided text.

    Returns *text* unchanged when BODY_WIDTH is 0. Otherwise each line is
    treated as a paragraph: lines starting with a space, ``-`` or ``*`` are
    emitted as they are (unless onlywhite() reports them blank), other
    non-empty lines are re-wrapped to BODY_WIDTH and followed by a blank
    line, and runs of empty lines are limited to two newlines.
    """
    if not BODY_WIDTH:
        return text

    assert wrap, "Requires Python 2.3."
    pieces = []
    trailing_newlines = 0
    for para in text.split("\n"):
        if len(para) == 0:
            # blank input line: keep it only while fewer than two newlines
            # have just been emitted
            if trailing_newlines < 2:
                pieces.append("\n")
                trailing_newlines += 1
        elif para[0] in (' ', '-', '*'):
            # indented text and list items are never re-wrapped
            if not onlywhite(para):
                pieces.append(para + "\n")
                trailing_newlines = 1
        else:
            for wrapped_line in wrap(para, BODY_WIDTH):
                pieces.append(wrapped_line + "\n")
            pieces.append("\n")
            trailing_newlines = 2
    return ''.join(pieces)
155
def hn(tag):
    """Return the heading level (1-9) for tags ``h1``..``h9``, else 0.

    Always returns an int: previously non-heading tags and ``h0`` fell off
    the end of the function and returned None, and an empty tag name raised
    IndexError.
    """
    # length is checked first so an empty tag name is never indexed
    if len(tag) == 2 and tag[0] == 'h':
        try:
            n = int(tag[1])
        except ValueError:
            return 0
        if 1 <= n <= 9:
            return n
    return 0
162
def dumb_property_dict(style):
    """Return a dict of css attributes parsed from a declaration string.

    *style* is text such as ``"color: red; font-weight: bold"``. Pieces
    without a ``:`` are ignored; names and values are stripped of
    surrounding whitespace.
    """
    properties = {}
    for declaration in style.split(';'):
        if ':' in declaration:
            name, value = declaration.split(':', 1)
            properties[name.strip()] = value.strip()
    return properties

def dumb_css_parser(data):
    """Return a dict of css selectors, each mapping to a dict of attributes.

    ``@import`` statements are removed first. The remaining text is split on
    ``}`` and each piece containing ``{`` becomes one selector entry.
    """
    # remove @import sentences
    importIndex = data.find('@import')
    while importIndex != -1:
        terminator = data.find(';', importIndex)
        if terminator == -1:
            # Unterminated @import: drop the rest of the text. Without this
            # check the slice below would re-insert the whole string and the
            # loop would never finish.
            data = data[0:importIndex]
        else:
            data = data[0:importIndex] + data[terminator + 1:]
        importIndex = data.find('@import')

    # parse the css. plain loop (no dict comprehension) in order to support
    # older pythons
    elements = {}
    for rule in data.split('}'):
        if '{' in rule:
            # split once only: nested blocks (e.g. @media) put several '{'
            # in one piece, which previously raised ValueError on unpacking
            selector, body = rule.split('{', 1)
            elements[selector.strip()] = dumb_property_dict(body)

    return elements
180
def element_style(attrs, style_def, parent_style):
    """Return a dict of the 'final' style attributes of the element.

    attrs        -- dict of the element's HTML attributes
    style_def    -- dict mapping css selectors (e.g. ``'.cls'``) to attribute
                    dicts
    parent_style -- style dict of the parent element; it is copied, never
                    modified

    Class styles are applied in the order the classes are listed, then the
    inline ``style`` attribute on top. Classes missing from *style_def* and
    valueless ``class``/``style`` attributes are ignored instead of raising.
    """
    style = parent_style.copy()
    class_attr = attrs.get('class')
    if class_attr:
        for css_class in class_attr.split():
            # a class with no stylesheet rule contributes nothing
            style.update(style_def.get('.' + css_class, {}))
    inline_style = attrs.get('style')
    if inline_style:
        style.update(dumb_property_dict(inline_style))
    return style
192
def google_list_style(style):
    """Classify a list as unordered (``'ul'``) or ordered (``'ol'``).

    A ``list-style-type`` of disc, circle, square or none means unordered;
    anything else, including a missing property, means ordered.
    """
    if 'list-style-type' not in style:
        return 'ol'
    bullet_types = ['disc', 'circle', 'square', 'none']
    if style['list-style-type'] in bullet_types:
        return 'ul'
    return 'ol'
200
def google_nest_count(style):
    """Calculate the nesting count of google doc lists.

    The ``margin-left`` value, minus its two-character unit suffix, is
    divided by GOOGLE_LIST_INDENT. The result is always an int, because the
    caller multiplies a string by it (true division would give a float on
    Python 3). A missing or non-numeric margin counts as nesting level 0.
    """
    if 'margin-left' not in style:
        return 0
    try:
        margin = int(style['margin-left'][:-2])
    except ValueError:
        # e.g. "auto" or a bare "0": no usable indent information
        return 0
    return margin // GOOGLE_LIST_INDENT
207
def google_has_height(style):
    """Tell whether the element's style defines 'height' explicitly."""
    return 'height' in style
213
def google_text_emphasis(style):
    """Collect the element's emphasis modifiers as a list.

    The values of ``text-decoration``, ``font-style`` and ``font-weight`` are
    returned in that order, skipping properties that are not set.
    """
    emphasis_properties = ('text-decoration', 'font-style', 'font-weight')
    return [style[prop] for prop in emphasis_properties if prop in style]
224
def google_fixed_width_font(style):
    """Tell whether the element's css selects a fixed width font.

    Only the exact ``font-family`` values 'Courier New' and 'Consolas' count.
    """
    if 'font-family' not in style:
        return False
    family = style['font-family']
    return family == 'Courier New' or family == 'Consolas'
233
def list_numbering_start(attrs):
    """Extract the numbering offset from list element attributes.

    Returns ``start - 1`` (the counter is incremented before each item is
    written), or 0 when there is no ``start`` attribute. A ``start`` that is
    valueless or not an integer is treated as absent instead of raising.
    """
    if 'start' not in attrs:
        return 0
    try:
        return int(attrs['start']) - 1
    except (TypeError, ValueError):
        # TypeError: valueless attribute (None); ValueError: e.g. start="a"
        return 0
240
class _html2text(HTMLParser.HTMLParser):
    """HTMLParser subclass that emits Markdown-structured text.

    Parser callbacks are funnelled into handle_tag() and o(); o() writes
    through ``self.out``, which defaults to collecting the pieces in
    ``self.outtextlist``. close() joins and returns the collected text.

    NOTE(review): several string literals here contain runs of spaces that
    are significant in Markdown (hard line breaks, indentation). They are
    kept exactly as found; if this file passed through a whitespace-collapsing
    tool, compare them with the upstream source.
    """

    def __init__(self, out=None, baseurl=''):
        """Set up parser state.

        out     -- callable receiving each piece of output text; defaults to
                   self.outtextf, which buffers the text for close()
        baseurl -- base used to resolve link targets in reference-style output
        """
        HTMLParser.HTMLParser.__init__(self)

        if out is None: self.out = self.outtextf
        else: self.out = out
        self.outtextlist = [] # empty list to store output characters before they are "joined"
        try:
            self.outtext = unicode()
        except NameError: # Python3
            self.outtext = str()
        self.quiet = 0
        self.p_p = 0 # number of newline character to print before next output
        self.outcount = 0
        self.start = 1
        self.space = 0
        self.a = []
        self.astack = []
        self.acount = 0
        self.list = []
        self.blockquote = 0
        self.pre = 0
        self.startpre = 0
        self.code = False
        self.br_toggle = ''
        self.lastWasNL = 0
        self.lastWasList = False
        self.style = 0
        self.style_def = {}
        self.tag_stack = []
        self.emphasis = 0
        self.drop_white_space = 0
        self.inheader = False
        self.abbr_title = None # current abbreviation definition
        self.abbr_data = None # last inner HTML (for abbr being defined)
        self.abbr_list = {} # stack of abbreviations to write later
        self.baseurl = baseurl

        if options.google_doc:
            # pop() rather than del: the table is module-level, so a second
            # parser instance would otherwise raise KeyError here
            unifiable_n.pop(name2cp('nbsp'), None)
            unifiable['nbsp'] = '&nbsp_place_holder;'

    def feed(self, data):
        """Feed *data* to the parser after rewriting the literal text
        ``</' + 'script>`` to ``</ignore>``."""
        data = data.replace("</' + 'script>", "</ignore>")
        HTMLParser.HTMLParser.feed(self, data)

    def outtextf(self, s):
        """Default output sink: buffer *s* and note whether it ended in a newline."""
        self.outtextlist.append(s)
        if s: self.lastWasNL = s[-1] == '\n'

    def close(self):
        """Finish parsing, flush pending output and return the buffered text."""
        HTMLParser.HTMLParser.close(self)

        self.pbr()
        self.o('', 0, 'end')

        self.outtext = self.outtext.join(self.outtextlist)

        if options.google_doc:
            self.outtext = self.outtext.replace('&nbsp_place_holder;', ' ')

        return self.outtext

    def handle_charref(self, c):
        """Parser callback for numeric character references."""
        self.o(charref(c), 1)

    def handle_entityref(self, c):
        """Parser callback for named entity references."""
        self.o(entityref(c), 1)

    def handle_starttag(self, tag, attrs):
        """Parser callback for opening tags."""
        self.handle_tag(tag, attrs, 1)

    def handle_endtag(self, tag):
        """Parser callback for closing tags."""
        self.handle_tag(tag, None, 0)

    def previousIndex(self, attrs):
        """ returns the index of certain set of attributes (of a link) in the
            self.a list

            If the set of attributes is not found, returns None
        """
        if not has_key(attrs, 'href'): return None

        i = -1
        for a in self.a:
            i += 1
            match = 0

            if has_key(a, 'href') and a['href'] == attrs['href']:
                if has_key(a, 'title') or has_key(attrs, 'title'):
                    # when either side has a title, both must have the same one
                    if (has_key(a, 'title') and has_key(attrs, 'title') and
                        a['title'] == attrs['title']):
                        match = True
                else:
                    match = True

            if match: return i

    def drop_last(self, nLetters):
        """Remove the last *nLetters* characters of self.outtext unless quiet.

        NOTE(review): output produced by o() is buffered in self.outtextlist
        and self.outtext is only filled by close(), so this may not touch
        text that was already emitted -- confirm intended behavior.
        """
        if not self.quiet:
            self.outtext = self.outtext[:-nLetters]

    def handle_emphasis(self, start, tag_style, parent_style):
        """handles various text emphases

        start        -- true for an opening tag, false for a closing tag
        tag_style    -- style dict of the current element
        parent_style -- style dict of its parent; emphasis already present
                        there is not emitted again
        """
        tag_emphasis = google_text_emphasis(tag_style)
        parent_emphasis = google_text_emphasis(parent_style)

        # handle Google's text emphasis
        # getattr: the module-level default ``options`` object may not define
        # hide_strikethrough (it is only guaranteed by the command line parser)
        strikethrough = ('line-through' in tag_emphasis and
                         getattr(options, 'hide_strikethrough', False))
        bold = 'bold' in tag_emphasis and not 'bold' in parent_emphasis
        italic = 'italic' in tag_emphasis and not 'italic' in parent_emphasis
        fixed = google_fixed_width_font(tag_style) and not \
                google_fixed_width_font(parent_style) and not self.pre

        if start:
            # crossed-out text must be handled before other attributes
            # in order not to output qualifiers unnecessarily
            if bold or italic or fixed:
                self.emphasis += 1
            if strikethrough:
                self.quiet += 1
            if italic:
                self.o("_")
                self.drop_white_space += 1
            if bold:
                self.o("**")
                self.drop_white_space += 1
            if fixed:
                self.o('`')
                self.drop_white_space += 1
                self.code = True
        else:
            if bold or italic or fixed:
                # there must not be whitespace before closing emphasis mark
                self.emphasis -= 1
                self.space = 0
                self.outtext = self.outtext.rstrip()
            if fixed:
                if self.drop_white_space:
                    # empty emphasis, drop it
                    self.drop_last(1)
                    self.drop_white_space -= 1
                else:
                    self.o('`')
                self.code = False
            if bold:
                if self.drop_white_space:
                    # empty emphasis, drop it
                    self.drop_last(2)
                    self.drop_white_space -= 1
                else:
                    self.o("**")
            if italic:
                if self.drop_white_space:
                    # empty emphasis, drop it
                    self.drop_last(1)
                    self.drop_white_space -= 1
                else:
                    self.o("_")
            # space is only allowed after *all* emphasis marks
            if (bold or italic) and not self.emphasis:
                self.o(" ")
            if strikethrough:
                self.quiet -= 1

    def handle_tag(self, tag, attrs, start):
        """Translate one opening (start true) or closing tag into output.

        tag   -- tag name
        attrs -- list of (name, value) pairs for opening tags, None for
                 closing tags
        start -- true for an opening tag, false for a closing tag
        """
        #attrs = fixattrs(attrs)
        if attrs is None:
            attrs = {}
        else:
            attrs = dict(attrs)

        if options.google_doc:
            # the attrs parameter is empty for a closing tag. in addition, we
            # need the attributes of the parent nodes in order to get a
            # complete style description for the current element. we assume
            # that google docs export well formed html.
            parent_style = {}
            if start:
                if self.tag_stack:
                    parent_style = self.tag_stack[-1][2]
                tag_style = element_style(attrs, self.style_def, parent_style)
                self.tag_stack.append((tag, attrs, tag_style))
            else:
                if self.tag_stack:
                    dummy, attrs, tag_style = self.tag_stack.pop()
                else:
                    # stray closing tag: nothing to pop, use an empty style
                    tag_style = {}
                if self.tag_stack:
                    parent_style = self.tag_stack[-1][2]

        if hn(tag):
            self.p()
            if start:
                self.inheader = True
                self.o(hn(tag)*"#" + ' ')
            else:
                self.inheader = False
                return # prevent redundant emphasis marks on headers

        if tag in ['p', 'div']:
            if options.google_doc:
                if start and google_has_height(tag_style):
                    self.p()
                else:
                    self.soft_br()
            else:
                self.p()

        # NOTE(review): a Markdown hard break needs two trailing spaces; the
        # literal is kept as found -- confirm no whitespace was lost.
        if tag == "br" and start: self.o(" \n")

        if tag == "hr" and start:
            self.p()
            self.o("* * *")
            self.p()

        if tag in ["head", "style", 'script']:
            if start: self.quiet += 1
            else: self.quiet -= 1

        if tag == "style":
            if start: self.style += 1
            else: self.style -= 1

        if tag in ["body"]:
            self.quiet = 0 # sites like 9rules.com never close <head>

        if tag == "blockquote":
            if start:
                self.p(); self.o('> ', 0, 1); self.start = 1
                self.blockquote += 1
            else:
                self.blockquote -= 1
                self.p()

        if tag in ['em', 'i', 'u']: self.o("_")
        if tag in ['strong', 'b']: self.o("**")
        if tag in ['del', 'strike']:
            if start:
                self.o("<"+tag+">")
            else:
                self.o("</"+tag+">")

        if options.google_doc:
            if not self.inheader:
                # handle some font attributes, but leave headers clean
                self.handle_emphasis(start, tag_style, parent_style)

        if tag == "code" and not self.pre: self.o('`') #TODO: `` `this` ``
        if tag == "abbr":
            if start:
                self.abbr_title = None
                self.abbr_data = ''
                if has_key(attrs, 'title'):
                    self.abbr_title = attrs['title']
            else:
                if self.abbr_title is not None:
                    self.abbr_list[self.abbr_data] = self.abbr_title
                    self.abbr_title = None
                self.abbr_data = ''

        if tag == "a" and not IGNORE_ANCHORS:
            if start:
                # a valueless href attribute is reported as None by the
                # parser; treat it like a missing href
                href = attrs.get('href')
                if href is not None and not (SKIP_INTERNAL_LINKS and href.startswith('#')):
                    self.astack.append(attrs)
                    self.o("[")
                else:
                    self.astack.append(None)
            else:
                if self.astack:
                    a = self.astack.pop()
                    if a:
                        if INLINE_LINKS:
                            self.o("](" + a['href'] + ")")
                        else:
                            i = self.previousIndex(a)
                            if i is not None:
                                a = self.a[i]
                            else:
                                self.acount += 1
                                a['count'] = self.acount
                                a['outcount'] = self.outcount
                                self.a.append(a)
                            self.o("][" + str(a['count']) + "]")

        if tag == "img" and start and not IGNORE_IMAGES:
            # skip images whose src is missing or valueless (None)
            if attrs.get('src') is not None:
                attrs['href'] = attrs['src']
                alt = attrs.get('alt', '')
                if INLINE_LINKS:
                    self.o("![")
                    self.o(alt)
                    self.o("]("+ attrs['href'] +")")
                else:
                    i = self.previousIndex(attrs)
                    if i is not None:
                        attrs = self.a[i]
                    else:
                        self.acount += 1
                        attrs['count'] = self.acount
                        attrs['outcount'] = self.outcount
                        self.a.append(attrs)
                    self.o("![")
                    self.o(alt)
                    self.o("]["+ str(attrs['count']) +"]")

        if tag == 'dl' and start: self.p()
        if tag == 'dt' and not start: self.pbr()
        if tag == 'dd' and start: self.o(' ')
        if tag == 'dd' and not start: self.pbr()

        if tag in ["ol", "ul"]:
            # Google Docs create sub lists as top level lists
            if (not self.list) and (not self.lastWasList):
                self.p()
            if start:
                if options.google_doc:
                    list_style = google_list_style(tag_style)
                else:
                    list_style = tag
                numbering_start = list_numbering_start(attrs)
                self.list.append({'name':list_style, 'num':numbering_start})
            else:
                if self.list: self.list.pop()
            self.lastWasList = True
        else:
            self.lastWasList = False

        if tag == 'li':
            self.pbr()
            if start:
                if self.list: li = self.list[-1]
                else: li = {'name':'ul', 'num':0}
                if options.google_doc:
                    nest_count = google_nest_count(tag_style)
                else:
                    nest_count = len(self.list)
                # int(): string repetition rejects a float nest count
                self.o(" " * int(nest_count)) #TODO: line up <ol><li>s > 9 correctly.
                if li['name'] == "ul": self.o(options.ul_item_mark + " ")
                elif li['name'] == "ol":
                    li['num'] += 1
                    self.o(str(li['num'])+". ")
                self.start = 1

        if tag in ["table", "tr"] and start: self.p()
        if tag == 'td': self.pbr()

        if tag == "pre":
            if start:
                self.startpre = 1
                self.pre = 1
            else:
                self.pre = 0
            self.p()

    def pbr(self):
        """Request a single line break before the next output, unless breaks
        are already pending."""
        if self.p_p == 0: self.p_p = 1

    def p(self):
        """Request a paragraph break (two newlines) before the next output."""
        self.p_p = 2

    def soft_br(self):
        """Request a line break and set the text emitted just before it."""
        self.pbr()
        self.br_toggle = ' '

    def o(self, data, puredata=0, force=0):
        """Write *data* to the output, emitting pending breaks and spacing.

        data     -- text to write
        puredata -- true for document text: outside <pre>, runs of
                    whitespace are collapsed to one space
        force    -- true to write even when data is empty; the value 'end'
                    additionally flushes pending link references and
                    abbreviation definitions
        """
        if self.abbr_data is not None: self.abbr_data += data

        if not self.quiet:
            if options.google_doc:
                # prevent white space immediately after 'begin emphasis' marks ('**' and '_')
                lstripped_data = data.lstrip()
                if self.drop_white_space and not (self.pre or self.code):
                    data = lstripped_data
                if lstripped_data != '':
                    self.drop_white_space = 0

            if puredata and not self.pre:
                data = re.sub(r'\s+', ' ', data)
                if data and data[0] == ' ':
                    # remember the leading space; it is written later only if
                    # the previous output did not end in a newline
                    self.space = 1
                    data = data[1:]
            if not data and not force: return

            if self.startpre:
                #self.out(" :") #TODO: not output when already one there
                self.startpre = 0

            bq = (">" * self.blockquote)
            if not (force and data and data[0] == ">") and self.blockquote: bq += " "

            if self.pre:
                bq += " "
                data = data.replace("\n", "\n"+bq)

            if self.start:
                self.space = 0
                self.p_p = 0
                self.start = 0

            if force == 'end':
                # It's the end.
                self.p_p = 0
                self.out("\n")
                self.space = 0

            if self.p_p:
                self.out((self.br_toggle+'\n'+bq)*self.p_p)
                self.space = 0
                self.br_toggle = ''

            if self.space:
                if not self.lastWasNL: self.out(' ')
                self.space = 0

            if self.a and ((self.p_p == 2 and LINKS_EACH_PARAGRAPH) or force == "end"):
                if force == "end": self.out("\n")

                newa = []
                for link in self.a:
                    if self.outcount > link['outcount']:
                        self.out(" ["+ str(link['count']) +"]: " + urlparse.urljoin(self.baseurl, link['href']))
                        if has_key(link, 'title'): self.out(" ("+link['title']+")")
                        self.out("\n")
                    else:
                        newa.append(link)

                if self.a != newa: self.out("\n") # Don't need an extra line when nothing was done.

                self.a = newa

            if self.abbr_list and force == "end":
                for abbr, definition in self.abbr_list.items():
                    self.out(" *[" + abbr + "]: " + definition + "\n")

            self.p_p = 0
            self.out(data)
            self.outcount += 1

    def handle_data(self, data):
        """Parser callback for text between tags.

        While inside <style>, the text is also parsed as css and merged into
        self.style_def.
        """
        if r'\/script>' in data: self.quiet -= 1

        if self.style:
            self.style_def.update(dumb_css_parser(data))

        self.o(data, 1)

    def unknown_decl(self, data):
        """Ignore unknown declarations."""
        pass
685
def wrapwrite(text):
    """Write *text* to standard output encoded as UTF-8.

    The bytes go to ``sys.stdout.buffer`` when that exists (Python 3) and to
    ``sys.stdout`` itself otherwise.
    """
    payload = text.encode('utf-8')
    try: #Python3
        sys.stdout.buffer.write(payload)
    except AttributeError:
        sys.stdout.write(payload)
692
def html2text_file(html, out=wrapwrite, baseurl=''):
    """Convert *html*, sending the output through *out*.

    html    -- the HTML text to convert
    out     -- output callable handed to the parser (None makes the parser
               buffer the text itself)
    baseurl -- base URL handed to the parser

    Returns whatever the parser's close() returns.
    """
    converter = _html2text(out, baseurl)
    # the second, empty feed is kept from the original implementation
    for chunk in (html, ""):
        converter.feed(chunk)
    return converter.close()
698
def html2text(html, baseurl=''):
    """Convert *html* to Markdown-structured text and return it, wrapped
    by optwrap()."""
    converted = html2text_file(html, None, baseurl)
    return optwrap(converted)
701
class Storage:
    """Plain attribute container for the conversion options."""
    pass

# Default options for library use; the command line entry point replaces
# this object with the parsed options.
options = Storage()
options.google_doc = False
options.ul_item_mark = '*'
# read by the Google Docs emphasis handling; without a default here,
# converting with google_doc enabled from library code raises AttributeError
options.hide_strikethrough = False
706
# Command line entry point: convert a file, a URL or standard input and write
# the result to standard output.
if __name__ == "__main__":
    baseurl = ''

    p = optparse.OptionParser('%prog [(filename|url) [encoding]]',
                              version='%prog ' + __version__)
    p.add_option("-g", "--google-doc", action="store_true", dest="google_doc",
        default=False, help="convert an html-exported Google Document")
    p.add_option("-d", "--dash-unordered-list", action="store_true", dest="ul_style_dash",
        default=False, help="use a dash rather than a star for unordered list items")
    p.add_option("-b", "--body-width", dest="body_width", action="store", type="int",
        default=78, help="number of characters per output line, 0 for no wrap")
    p.add_option("-i", "--google-list-indent", dest="list_indent", action="store", type="int",
        default=GOOGLE_LIST_INDENT, help="number of pixels Google indents nested lists")
    p.add_option("-s", "--hide-strikethrough", action="store_true", dest="hide_strikethrough",
        default=False, help="hide strike-through text. only relevent when -g is specified as well")
    (options, args) = p.parse_args()

    # handle options
    if options.ul_style_dash:
        options.ul_item_mark = '-'
    else:
        options.ul_item_mark = '*'

    BODY_WIDTH = options.body_width
    GOOGLE_LIST_INDENT = options.list_indent

    # process input
    if len(args) > 0:
        file_ = args[0]
        encoding = None
        if len(args) == 2:
            encoding = args[1]
        if len(args) > 2:
            p.error('Too many arguments')

        if file_.startswith('http://') or file_.startswith('https://'):
            baseurl = file_
            j = urllib.urlopen(baseurl)
            text = j.read()
            if encoding is None:
                try:
                    from feedparser import _getCharacterEncoding as enc
                except ImportError:
                    enc = lambda x, y: ('utf-8', 1)
                encoding = enc(j.headers, text)[0]
                if encoding == 'us-ascii':
                    encoding = 'utf-8'
            data = text.decode(encoding)

        else:
            # try/finally (rather than ``with``) closes the file even on
            # error while staying usable on old interpreters
            input_file = open(file_, 'rb')
            try:
                data = input_file.read()
            finally:
                input_file.close()
            if encoding is None:
                try:
                    from chardet import detect
                except ImportError:
                    detect = lambda x: {'encoding': 'utf-8'}
                # the detector may report no encoding (None); fall back to
                # utf-8 instead of passing None to decode()
                encoding = detect(data)['encoding'] or 'utf-8'
            data = data.decode(encoding)
    else:
        data = sys.stdin.read()
    wrapwrite(html2text(data, baseurl))
765 else:
766 data = sys.stdin.read()
767 wrapwrite(html2text(data, baseurl))