gooderp18绿色标准版
You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

926 lines
36KB

  1. # -*- coding: utf-8 -*-
  2. # Part of Odoo. See LICENSE file for full copyright and licensing details.
  3. import base64
  4. import collections
  5. import itertools
  6. import logging
  7. import random
  8. import re
  9. import socket
  10. import time
  11. import email.utils
  12. from email.utils import getaddresses as orig_getaddresses
  13. from urllib.parse import urlparse
  14. import html as htmllib
  15. import idna
  16. import markupsafe
  17. from lxml import etree, html
  18. from lxml.html import clean, defs
  19. from werkzeug import urls
  20. from odoo.tools import misc
# Explicit public API of this module: email address parsing/normalization
# helpers and HTML sanitization/conversion utilities.
__all__ = [
    "email_domain_extract",
    "email_domain_normalize",
    "email_normalize",
    "email_normalize_all",
    "email_split",
    "encapsulate_email",
    "formataddr",
    "html2plaintext",
    "html_normalize",
    "html_sanitize",
    "is_html_empty",
    "parse_contact_from_email",
    "plaintext2html",
    "single_email_re",
]
_logger = logging.getLogger(__name__)

# disable strict mode when present: we rely on the original non-strict
# parsing, and we know that it isn't reliable, that's ok.
# cfr python/cpython@4a153a1d3b18803a684cd1bcc2cdf3ede3dbae19
if hasattr(email.utils, 'supports_strict_parsing'):
    def getaddresses(fieldvalues):
        # newer Pythons default to strict parsing: force the lenient behavior
        return orig_getaddresses(fieldvalues, strict=False)
else:
    # older Pythons have no ``strict`` parameter: use the function as-is
    getaddresses = orig_getaddresses
  46. #----------------------------------------------------------
  47. # HTML Sanitizer
  48. #----------------------------------------------------------
# Attributes kept by the sanitizer on top of lxml's default safe set;
# mostly data-* attributes used by the Odoo editor and mail quote detection.
safe_attrs = defs.safe_attrs | frozenset(
    ['style',
     'data-o-mail-quote', 'data-o-mail-quote-node',  # quote detection
     'data-oe-model', 'data-oe-id', 'data-oe-field', 'data-oe-type', 'data-oe-expression', 'data-oe-translation-source-sha', 'data-oe-nodeid',
     'data-last-history-steps', 'data-oe-protected', 'data-embedded', 'data-embedded-editable', 'data-embedded-props', 'data-oe-version',
     'data-oe-transient-content', 'data-behavior-props', 'data-prop-name',  # legacy editor
     'data-publish', 'data-id', 'data-res_id', 'data-interval', 'data-member_id', 'data-scroll-background-ratio', 'data-view-id',
     'data-class', 'data-mimetype', 'data-original-src', 'data-original-id', 'data-gl-filter', 'data-quality', 'data-resize-width',
     'data-shape', 'data-shape-colors', 'data-file-name', 'data-original-mimetype',
     'data-mimetype-before-conversion',
     ])
# Tag policy passed to the cleaner: tags to allow, tags removed together
# with their content (kill), and tags stripped while keeping their content.
SANITIZE_TAGS = {
    # allow new semantic HTML5 tags
    'allow_tags': defs.tags | frozenset('article bdi section header footer hgroup nav aside figure main'.split() + [etree.Comment]),
    'kill_tags': ['base', 'embed', 'frame', 'head', 'iframe', 'link', 'meta',
                  'noscript', 'object', 'script', 'style', 'title'],
    'remove_tags': ['html', 'body'],
}
  67. class _Cleaner(clean.Cleaner):
  68. _style_re = re.compile(r'''([\w-]+)\s*:\s*((?:[^;"']|"[^";]*"|'[^';]*')+)''')
  69. _style_whitelist = [
  70. 'font-size', 'font-family', 'font-weight', 'font-style', 'background-color', 'color', 'text-align',
  71. 'line-height', 'letter-spacing', 'text-transform', 'text-decoration', 'text-decoration', 'opacity',
  72. 'float', 'vertical-align', 'display',
  73. 'padding', 'padding-top', 'padding-left', 'padding-bottom', 'padding-right',
  74. 'margin', 'margin-top', 'margin-left', 'margin-bottom', 'margin-right',
  75. 'white-space',
  76. # box model
  77. 'border', 'border-color', 'border-radius', 'border-style', 'border-width', 'border-top', 'border-bottom',
  78. 'height', 'width', 'max-width', 'min-width', 'min-height',
  79. # tables
  80. 'border-collapse', 'border-spacing', 'caption-side', 'empty-cells', 'table-layout']
  81. _style_whitelist.extend(
  82. ['border-%s-%s' % (position, attribute)
  83. for position in ['top', 'bottom', 'left', 'right']
  84. for attribute in ('style', 'color', 'width', 'left-radius', 'right-radius')]
  85. )
  86. strip_classes = False
  87. sanitize_style = False
  88. conditional_comments = True
  89. def __call__(self, doc):
  90. super(_Cleaner, self).__call__(doc)
  91. # if we keep attributes but still remove classes
  92. if not getattr(self, 'safe_attrs_only', False) and self.strip_classes:
  93. for el in doc.iter(tag=etree.Element):
  94. self.strip_class(el)
  95. # if we keep style attribute, sanitize them
  96. if not self.style and self.sanitize_style:
  97. for el in doc.iter(tag=etree.Element):
  98. self.parse_style(el)
  99. def strip_class(self, el):
  100. if el.attrib.get('class'):
  101. del el.attrib['class']
  102. def parse_style(self, el):
  103. attributes = el.attrib
  104. styling = attributes.get('style')
  105. if styling:
  106. valid_styles = collections.OrderedDict()
  107. styles = self._style_re.findall(styling)
  108. for style in styles:
  109. if style[0].lower() in self._style_whitelist:
  110. valid_styles[style[0].lower()] = style[1]
  111. if valid_styles:
  112. el.attrib['style'] = '; '.join('%s:%s' % (key, val) for (key, val) in valid_styles.items())
  113. else:
  114. del el.attrib['style']
  115. def kill_conditional_comments(self, doc):
  116. """Override the default behavior of lxml.
  117. https://github.com/lxml/lxml/blob/e82c9153c4a7d505480b94c60b9a84d79d948efb/src/lxml/html/clean.py#L501-L510
  118. In some use cases, e.g. templates used for mass mailing,
  119. we send emails containing conditional comments targeting Microsoft Outlook,
  120. to give special styling instructions.
  121. https://github.com/odoo/odoo/pull/119325/files#r1301064789
  122. Within these conditional comments, unsanitized HTML can lie.
  123. However, in modern browser, these comments are considered as simple comments,
  124. their content is not executed.
  125. https://caniuse.com/sr_ie-features
  126. """
  127. if self.conditional_comments:
  128. super().kill_conditional_comments(doc)
  129. def tag_quote(el):
  130. def _create_new_node(tag, text, tail=None, attrs=None):
  131. new_node = etree.Element(tag)
  132. new_node.text = text
  133. new_node.tail = tail
  134. if attrs:
  135. for key, val in attrs.items():
  136. new_node.set(key, val)
  137. return new_node
  138. def _tag_matching_regex_in_text(regex, node, tag='span', attrs=None):
  139. text = node.text or ''
  140. if not re.search(regex, text):
  141. return
  142. child_node = None
  143. idx, node_idx = 0, 0
  144. for item in re.finditer(regex, text):
  145. new_node = _create_new_node(tag, text[item.start():item.end()], None, attrs)
  146. if child_node is None:
  147. node.text = text[idx:item.start()]
  148. new_node.tail = text[item.end():]
  149. node.insert(node_idx, new_node)
  150. else:
  151. child_node.tail = text[idx:item.start()]
  152. new_node.tail = text[item.end():]
  153. node.insert(node_idx, new_node)
  154. child_node = new_node
  155. idx = item.end()
  156. node_idx = node_idx + 1
  157. el_class = el.get('class', '') or ''
  158. el_id = el.get('id', '') or ''
  159. # gmail or yahoo // # outlook, html // # msoffice
  160. if 'gmail_extra' in el_class or \
  161. ('SkyDrivePlaceholder' in el_class or 'SkyDrivePlaceholder' in el_class):
  162. el.set('data-o-mail-quote', '1')
  163. if el.getparent() is not None:
  164. el.getparent().set('data-o-mail-quote-container', '1')
  165. if (el.tag == 'hr' and ('stopSpelling' in el_class or 'stopSpelling' in el_id)) or \
  166. 'yahoo_quoted' in el_class:
  167. # Quote all elements after this one
  168. el.set('data-o-mail-quote', '1')
  169. for sibling in el.itersiblings(preceding=False):
  170. sibling.set('data-o-mail-quote', '1')
  171. # odoo, gmail and outlook automatic signature wrapper
  172. is_signature_wrapper = 'odoo_signature_wrapper' in el_class or 'gmail_signature' in el_class or el_id == "Signature"
  173. is_outlook_auto_message = 'appendonsend' in el_id
  174. # gmail and outlook reply quote
  175. is_outlook_reply_quote = 'divRplyFwdMsg' in el_id
  176. is_gmail_quote = 'gmail_quote' in el_class
  177. is_quote_wrapper = is_signature_wrapper or is_gmail_quote or is_outlook_reply_quote
  178. if is_quote_wrapper:
  179. el.set('data-o-mail-quote-container', '1')
  180. el.set('data-o-mail-quote', '1')
  181. # outlook reply wrapper is preceded with <hr> and a div containing recipient info
  182. if is_outlook_reply_quote:
  183. hr = el.getprevious()
  184. reply_quote = el.getnext()
  185. if hr is not None and hr.tag == 'hr':
  186. hr.set('data-o-mail-quote', '1')
  187. if reply_quote is not None:
  188. reply_quote.set('data-o-mail-quote-container', '1')
  189. reply_quote.set('data-o-mail-quote', '1')
  190. if is_outlook_auto_message:
  191. if not el.text or not el.text.strip():
  192. el.set('data-o-mail-quote-container', '1')
  193. el.set('data-o-mail-quote', '1')
  194. # html signature (-- <br />blah)
  195. signature_begin = re.compile(r"((?:(?:^|\n)[-]{2}[\s]?$))")
  196. if el.text and el.find('br') is not None and re.search(signature_begin, el.text):
  197. el.set('data-o-mail-quote', '1')
  198. if el.getparent() is not None:
  199. el.getparent().set('data-o-mail-quote-container', '1')
  200. # text-based quotes (>, >>) and signatures (-- Signature)
  201. text_complete_regex = re.compile(r"((?:\n[>]+[^\n\r]*)+|(?:(?:^|\n)[-]{2}[\s]?[\r\n]{1,2}[\s\S]+))")
  202. if not el.get('data-o-mail-quote'):
  203. _tag_matching_regex_in_text(text_complete_regex, el, 'span', {'data-o-mail-quote': '1'})
  204. if el.tag == 'blockquote':
  205. # remove single node
  206. el.set('data-o-mail-quote-node', '1')
  207. el.set('data-o-mail-quote', '1')
  208. if el.getparent() is not None and not el.getparent().get('data-o-mail-quote-node'):
  209. if el.getparent().get('data-o-mail-quote'):
  210. el.set('data-o-mail-quote', '1')
  211. # only quoting the elements following the first quote in the container
  212. # avoids issues with repeated calls to html_normalize
  213. elif el.getparent().get('data-o-mail-quote-container'):
  214. if (first_sibling_quote := el.getparent().find("*[@data-o-mail-quote]")) is not None:
  215. siblings = el.getparent().getchildren()
  216. quote_index = siblings.index(first_sibling_quote)
  217. element_index = siblings.index(el)
  218. if quote_index < element_index:
  219. el.set('data-o-mail-quote', '1')
  220. if el.getprevious() is not None and el.getprevious().get('data-o-mail-quote') and not el.text_content().strip():
  221. el.set('data-o-mail-quote', '1')
def html_normalize(src, filter_callback=None, output_method="html"):
    """ Normalize `src` for storage as an html field value.

    The string is parsed as an html tag soup, made valid, then decorated for
    "email quote" detection, and prepared for an optional filtering.
    The filtering step (e.g. sanitization) should be performed by the
    `filter_callback` function (to avoid multiple parsing operations, and
    normalize the result).

    :param src: the html string to normalize
    :param filter_callback: optional callable taking a single `etree._Element`
        document parameter, to be called during normalization in order to
        filter the output document
    :param output_method: defines the output method to pass to `html.tostring`.
        It defaults to 'html', but can also be 'xml' for xhtml output.
    :return: the normalized html string, or "" when the input parses to an
        empty document (comment-only / whitespace-only content)
    """
    if not src:
        return src

    # html: remove encoding attribute inside tags
    src = re.sub(r'(<[^>]*\s)(encoding=(["\'][^"\']*?["\']|[^\s\n\r>]+)(\s[^>]*|/)?>)', "", src, flags=re.IGNORECASE | re.DOTALL)

    # repair malformed comment delimiters that would confuse the parser
    src = src.replace('--!>', '-->')
    src = re.sub(r'(<!-->|<!--->)', '<!-- -->', src)

    # On the specific case of Outlook desktop it adds unnecessary '<o:.*></o:.*>' tags which are parsed
    # in '<p></p>' which may alter the appearance (eg. spacing) of the mail body
    src = re.sub(r'</?o:.*?>', '', src)

    try:
        doc = html.fromstring(src)
    except etree.ParserError as e:
        # HTML comment only string, whitespace only..
        if 'empty' in str(e):
            return ""
        raise

    # perform quote detection before cleaning and class removal
    if doc is not None:
        for el in doc.iter(tag=etree.Element):
            tag_quote(el)

    if filter_callback:
        doc = filter_callback(doc)

    src = html.tostring(doc, encoding='unicode', method=output_method)

    # this is ugly, but lxml/etree tostring want to put everything in a
    # 'div' that breaks the editor -> remove that
    if src.startswith('<div>') and src.endswith('</div>'):
        src = src[5:-6]

    # html considerations so real html content match database value
    src = src.replace(u'\xa0', u'&nbsp;')

    return src
def html_sanitize(src, silent=True, sanitize_tags=True, sanitize_attributes=False, sanitize_style=False, sanitize_form=True, sanitize_conditional_comments=True, strip_style=False, strip_classes=False, output_method="html"):
    """ Sanitize an html string: normalize it through `html_normalize` and
    clean it with a whitelist-based `_Cleaner`.

    :param src: html string to sanitize
    :param bool silent: when True (default), parsing/cleaning errors are
        logged and a placeholder paragraph is returned instead of raising
    :param bool sanitize_tags: apply the SANITIZE_TAGS tag policy
    :param bool sanitize_attributes: keep only attributes of ``safe_attrs``
    :param bool sanitize_style: filter inline styles through the whitelist
    :param bool sanitize_form: remove form tags
    :param bool sanitize_conditional_comments: remove conditional comments
    :param bool strip_style: remove style tags/attributes entirely
    :param bool strip_classes: remove class attributes
    :param str output_method: serialization method, 'html' or 'xml'
    :rtype: markupsafe.Markup
    """
    if not src:
        return src

    logger = logging.getLogger(__name__ + '.html_sanitize')

    def sanitize_handler(doc):
        # filter callback given to html_normalize: runs the cleaner in place
        kwargs = {
            'page_structure': True,
            'style': strip_style,              # True = remove style tags/attrs
            'sanitize_style': sanitize_style,  # True = sanitize styling
            'forms': sanitize_form,            # True = remove form tags
            'remove_unknown_tags': False,
            'comments': False,
            'conditional_comments': sanitize_conditional_comments,  # True = remove conditional comments
            'processing_instructions': False
        }
        if sanitize_tags:
            kwargs.update(SANITIZE_TAGS)

        if sanitize_attributes:  # We keep all attributes in order to keep "style"
            if strip_classes:
                current_safe_attrs = safe_attrs - frozenset(['class'])
            else:
                current_safe_attrs = safe_attrs
            kwargs.update({
                'safe_attrs_only': True,
                'safe_attrs': current_safe_attrs,
            })
        else:
            kwargs.update({
                'safe_attrs_only': False,  # keep oe-data attributes + style
                'strip_classes': strip_classes,  # remove classes, even when keeping other attributes
            })

        cleaner = _Cleaner(**kwargs)
        cleaner(doc)
        return doc

    try:
        sanitized = html_normalize(src, filter_callback=sanitize_handler, output_method=output_method)
    except etree.ParserError:
        if not silent:
            raise
        logger.warning(u'ParserError obtained when sanitizing %r', src, exc_info=True)
        sanitized = '<p>ParserError when sanitizing</p>'
    except Exception:
        if not silent:
            raise
        logger.warning(u'unknown error obtained when sanitizing %r', src, exc_info=True)
        sanitized = '<p>Unknown error when sanitizing</p>'
    return markupsafe.Markup(sanitized)
  313. # ----------------------------------------------------------
  314. # HTML/Text management
  315. # ----------------------------------------------------------
# protocols that must not be turned into clickable http links
URL_SKIP_PROTOCOL_REGEX = r'mailto:|tel:|sms:'
# href="..." attributes, excluding the protocols above (group 2 = the url)
URL_REGEX = rf'''(\bhref=['"](?!{URL_SKIP_PROTOCOL_REGEX})([^'"]+)['"])'''
# bare urls found in plain text
TEXT_URL_REGEX = r'https?://[\w@:%.+&~#=/-]+(?:\?\S+)?'
# retrieve inner content of the link
HTML_TAG_URL_REGEX = URL_REGEX + r'([^<>]*>([^<>]+)<\/)?'
# any html tag
HTML_TAGS_REGEX = re.compile('<.*?>')
# line-breaking tags (and newlines) treated as whitespace when flattening html
HTML_NEWLINES_REGEX = re.compile('<(div|p|br|tr)[^>]*>|\n')
  323. def validate_url(url):
  324. if urls.url_parse(url).scheme not in ('http', 'https', 'ftp', 'ftps'):
  325. return 'http://' + url
  326. return url
  327. def is_html_empty(html_content):
  328. """Check if a html content is empty. If there are only formatting tags with style
  329. attributes or a void content return True. Famous use case if a
  330. '<p style="..."><br></p>' added by some web editor.
  331. :param str html_content: html content, coming from example from an HTML field
  332. :returns: bool, True if no content found or if containing only void formatting tags
  333. """
  334. if not html_content:
  335. return True
  336. icon_re = r'<\s*(i|span)\b(\s+[A-Za-z_-][A-Za-z0-9-_]*(\s*=\s*[\'"][^"\']*[\'"])?)*\s*\bclass\s*=\s*["\'][^"\']*\b(fa|fab|fad|far|oi)\b'
  337. tag_re = r'<\s*\/?(?:p|div|section|span|br|b|i|font)\b(?:(\s+[A-Za-z_-][A-Za-z0-9-_]*(\s*=\s*[\'"][^"\']*[\'"]))*)(?:\s*>|\s*\/\s*>)'
  338. return not bool(re.sub(tag_re, '', html_content).strip()) and not re.search(icon_re, html_content)
  339. def html_keep_url(text):
  340. """ Transform the url into clickable link with <a/> tag """
  341. idx = 0
  342. final = ''
  343. link_tags = re.compile(r"""(?<!["'])((ftp|http|https):\/\/(\w+:{0,1}\w*@)?([^\s<"']+)(:[0-9]+)?(\/|\/([^\s<"']))?)(?![^\s<"']*["']|[^\s<"']*</a>)""")
  344. for item in re.finditer(link_tags, text):
  345. final += text[idx:item.start()]
  346. final += create_link(item.group(0), item.group(0))
  347. idx = item.end()
  348. final += text[idx:]
  349. return final
  350. def html_to_inner_content(html):
  351. """Returns unformatted text after removing html tags and excessive whitespace from a
  352. string/Markup. Passed strings will first be sanitized.
  353. """
  354. if is_html_empty(html):
  355. return ''
  356. if not isinstance(html, markupsafe.Markup):
  357. html = html_sanitize(html)
  358. processed = re.sub(HTML_NEWLINES_REGEX, ' ', html)
  359. processed = re.sub(HTML_TAGS_REGEX, '', processed)
  360. processed = re.sub(r' {2,}|\t', ' ', processed)
  361. processed = htmllib.unescape(processed)
  362. processed = processed.strip()
  363. return processed
  364. def create_link(url, label):
  365. return f'<a href="{url}" target="_blank" rel="noreferrer noopener">{label}</a>'
def html2plaintext(html, body_id=None, encoding='utf-8', include_references=True):
    """ From an HTML text, convert the HTML to plain text.
    If @param body_id is provided then this is the tag where the
    body (not necessarily <body>) starts.

    :param include_references: If False, numbered references and
        URLs for links and images will not be included.
    """
    ## (c) Fry-IT, www.fry-it.com, 2007
    ## <peter@fry-it.com>
    ## download here: http://www.peterbe.com/plog/html2plaintext
    if not (html and html.strip()):
        return ''

    if isinstance(html, bytes):
        html = html.decode(encoding)
    else:
        assert isinstance(html, str), f"expected str got {html.__class__.__name__}"

    tree = etree.fromstring(html, parser=etree.HTMLParser())

    # narrow the tree down to the requested subtree (or <body>) when present
    if body_id is not None:
        source = tree.xpath('//*[@id=%s]' % (body_id,))
    else:
        source = tree.xpath('//body')
    if len(source):
        tree = source[0]

    url_index = []
    linkrefs = itertools.count(1)
    if include_references:
        # replace links and images by inline '[N]' references; the targets
        # are appended as a numbered list at the end of the text
        for link in tree.findall('.//a'):
            if url := link.get('href'):
                link.tag = 'span'
                link.text = f'{link.text} [{next(linkrefs)}]'
                url_index.append(url)
        for img in tree.findall('.//img'):
            if src := img.get('src'):
                img.tag = 'span'
                # use the file name part of the url as label, fallback 'Image'
                img_name = re.search(r'[^/]+(?=\.[a-zA-Z]+(?:\?|$))', src)
                img.text = '%s [%s]' % (img_name[0] if img_name else 'Image', next(linkrefs))
                url_index.append(src)

    html = etree.tostring(tree, encoding="unicode")
    # \r char is converted into &#13;, must remove it
    html = html.replace('&#13;', '')

    # poor-man markdown-ish rendering of basic emphasis/structure tags
    html = html.replace('<strong>', '*').replace('</strong>', '*')
    html = html.replace('<b>', '*').replace('</b>', '*')
    html = html.replace('<h3>', '*').replace('</h3>', '*')
    html = html.replace('<h2>', '**').replace('</h2>', '**')
    html = html.replace('<h1>', '**').replace('</h1>', '**')
    html = html.replace('<em>', '/').replace('</em>', '/')
    html = html.replace('<tr>', '\n')
    html = html.replace('</p>', '\n')
    html = re.sub(r'<br\s*/?>', '\n', html)
    # drop every remaining tag, then unescape the basic entities
    html = re.sub('<.*?>', ' ', html)
    html = html.replace(' ' * 2, ' ')
    html = html.replace('&gt;', '>')
    html = html.replace('&lt;', '<')
    html = html.replace('&amp;', '&')
    html = html.replace('&nbsp;', '\N{NO-BREAK SPACE}')

    # strip all lines
    html = '\n'.join([x.strip() for x in html.splitlines()])
    html = html.replace('\n' * 2, '\n')

    if url_index:
        html += '\n\n'
        for i, url in enumerate(url_index, start=1):
            html += f'[{i}] {url}\n'

    return html.strip()
  429. def plaintext2html(text, container_tag=None):
  430. r"""Convert plaintext into html. Content of the text is escaped to manage
  431. html entities, using :func:`~odoo.tools.misc.html_escape`.
  432. - all ``\n``, ``\r`` are replaced by ``<br/>``
  433. - enclose content into ``<p>``
  434. - convert url into clickable link
  435. - 2 or more consecutive ``<br/>`` are considered as paragraph breaks
  436. :param str text: plaintext to convert
  437. :param str container_tag: container of the html; by default the content is
  438. embedded into a ``<div>``
  439. :rtype: markupsafe.Markup
  440. """
  441. assert isinstance(text, str)
  442. text = misc.html_escape(text)
  443. # 1. replace \n and \r
  444. text = re.sub(r'(\r\n|\r|\n)', '<br/>', text)
  445. # 2. clickable links
  446. text = html_keep_url(text)
  447. # 3-4: form paragraphs
  448. idx = 0
  449. final = '<p>'
  450. br_tags = re.compile(r'(([<]\s*[bB][rR]\s*/?[>]\s*){2,})')
  451. for item in re.finditer(br_tags, text):
  452. final += text[idx:item.start()] + '</p><p>'
  453. idx = item.end()
  454. final += text[idx:] + '</p>'
  455. # 5. container
  456. if container_tag: # FIXME: validate that container_tag is just a simple tag?
  457. final = '<%s>%s</%s>' % (container_tag, final, container_tag)
  458. return markupsafe.Markup(final)
def append_content_to_html(html, content, plaintext=True, preserve=False, container_tag=None):
    """ Append extra content at the end of an HTML snippet, trying
    to locate the end of the HTML document (</body>, </html>, or
    EOF), and converting the provided content in html unless ``plaintext``
    is ``False``.

    Content conversion can be done in two ways:

    - wrapping it into a pre (``preserve=True``)
    - use plaintext2html (``preserve=False``, using ``container_tag`` to
      wrap the whole content)

    A side-effect of this method is to coerce all HTML tags to
    lowercase in ``html``, and strip enclosing <html> or <body> tags in
    content if ``plaintext`` is False.

    :param str html: html tagsoup (doesn't have to be XHTML)
    :param str content: extra content to append
    :param bool plaintext: whether content is plaintext and should
        be wrapped in a <pre/> tag.
    :param bool preserve: if content is plaintext, wrap it into a <pre>
        instead of converting it into html
    :param str container_tag: tag to wrap the content into, defaults to `div`.
    :rtype: markupsafe.Markup
    """
    if plaintext and preserve:
        content = '\n<pre>%s</pre>\n' % misc.html_escape(content)
    elif plaintext:
        content = '\n%s\n' % plaintext2html(content, container_tag)
    else:
        # strip document-level wrapper tags from the appended html fragment
        content = re.sub(r'(?i)(</?(?:html|body|head|!\s*DOCTYPE)[^>]*>)', '', content)
        content = '\n%s\n' % content
    # Force all tags to lowercase
    html = re.sub(r'(</?)(\w+)([ >])',
                  lambda m: '%s%s%s' % (m[1], m[2].lower(), m[3]), html)
    # insert just before </body>, else just before </html>, else at EOF
    insert_location = html.find('</body>')
    if insert_location == -1:
        insert_location = html.find('</html>')
    if insert_location == -1:
        return markupsafe.Markup('%s%s' % (html, content))
    return markupsafe.Markup('%s%s%s' % (html[:insert_location], content, html[insert_location:]))
  496. def prepend_html_content(html_body, html_content):
  497. """Prepend some HTML content at the beginning of an other HTML content."""
  498. replacement = re.sub(r'(?i)(</?(?:html|body|head|!\s*DOCTYPE)[^>]*>)', '', html_content)
  499. html_content = markupsafe.Markup(replacement) if isinstance(html_content, markupsafe.Markup) else replacement
  500. html_content = html_content.strip()
  501. body_match = re.search(r'<body[^>]*>', html_body) or re.search(r'<html[^>]*>', html_body)
  502. insert_index = body_match.end() if body_match else 0
  503. return html_body[:insert_index] + html_content + html_body[insert_index:]
  504. #----------------------------------------------------------
  505. # Emails
  506. #----------------------------------------------------------
# matches any email in a body of text
email_re = re.compile(r"""([a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,63})""", re.VERBOSE)

# matches a string containing only one email
single_email_re = re.compile(r"""^[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,63}$""", re.VERBOSE)

# matches '<...>' tokens, as found in Message-ID / References mail headers
mail_header_msgid_re = re.compile('<[^<>]+>')

# characters needing a backslash escape inside a quoted display-name
email_addr_escapes_re = re.compile(r'[\\"]')
  513. def generate_tracking_message_id(res_id):
  514. """Returns a string that can be used in the Message-ID RFC822 header field
  515. Used to track the replies related to a given object thanks to the "In-Reply-To"
  516. or "References" fields that Mail User Agents will set.
  517. """
  518. try:
  519. rnd = random.SystemRandom().random()
  520. except NotImplementedError:
  521. rnd = random.random()
  522. rndstr = ("%.15f" % rnd)[2:]
  523. return "<%s.%.15f-openerp-%s@%s>" % (rndstr, time.time(), res_id, socket.gethostname())
def email_split_tuples(text):
    """ Return a list of (name, email) address tuples found in ``text``. Note
    that text should be an email header or a stringified email list as it may
    give broader results than expected on actual text. """
    def _parse_based_on_spaces(pair):
        """ With input 'name email@domain.com' (missing quotes for a formatting)
        getaddresses returns ('', 'name email@domain.com'). This when having no
        name and an email a fallback to enhance parsing is to redo a getaddresses
        by replacing spaces by commas. The new email will be split into sub pairs
        allowing to find the email and name parts, allowing to make a new name /
        email pair. Emails should not contain spaces thus this is coherent with
        email formation. """
        name, email = pair
        if not name and email and ' ' in email:
            inside_pairs = getaddresses([email.replace(' ', ',')])
            name_parts, found_email = [], False
            for pair in inside_pairs:
                # sub-tokens without '@' accumulate into the display name
                if pair[1] and '@' not in pair[1]:
                    name_parts.append(pair[1])
                # the last token containing '@' wins as the email address
                if pair[1] and '@' in pair[1]:
                    found_email = pair[1]
            name, email = (' '.join(name_parts), found_email) if found_email else (name, email)
        return (name, email)

    if not text:
        return []
    # found valid pairs, filtering out failed parsing
    valid_pairs = [
        (addr[0], addr[1]) for addr in getaddresses([text])
        # getaddresses() returns '' when email parsing fails, and
        # sometimes returns emails without at least '@'. The '@'
        # is strictly required in RFC2822's `addr-spec`.
        if addr[1] and '@' in addr[1]
    ]
    # corner case: returning '@gmail.com'-like email (see test_email_split)
    if any(pair[1].startswith('@') for pair in valid_pairs):
        filtered = [
            found_email for found_email in email_re.findall(text)
            if found_email and not found_email.startswith('@')
        ]
        if filtered:
            valid_pairs = [('', found_email) for found_email in filtered]
    return list(map(_parse_based_on_spaces, valid_pairs))
  566. def email_split(text):
  567. """ Return a list of the email addresses found in ``text`` """
  568. if not text:
  569. return []
  570. return [email for (name, email) in email_split_tuples(text)]
  571. def email_split_and_format(text):
  572. """ Return a list of email addresses found in ``text``, formatted using
  573. formataddr. """
  574. if not text:
  575. return []
  576. return [formataddr((name, email)) for (name, email) in email_split_tuples(text)]
  577. def email_split_and_format_normalize(text):
  578. """ Same as 'email_split_and_format' but normalizing email. """
  579. return [
  580. formataddr(
  581. (name, _normalize_email(email))
  582. ) for (name, email) in email_split_tuples(text)
  583. ]
  584. def email_normalize(text, strict=True):
  585. """ Sanitize and standardize email address entries. As of rfc5322 section
  586. 3.4.1 local-part is case-sensitive. However most main providers do consider
  587. the local-part as case insensitive. With the introduction of smtp-utf8
  588. within odoo, this assumption is certain to fall short for international
  589. emails. We now consider that
  590. * if local part is ascii: normalize still 'lower' ;
  591. * else: use as it, SMTP-UF8 is made for non-ascii local parts;
  592. Concerning domain part of the address, as of v14 international domain (IDNA)
  593. are handled fine. The domain is always lowercase, lowering it is fine as it
  594. is probably an error. With the introduction of IDNA, there is an encoding
  595. that allow non-ascii characters to be encoded to ascii ones, using 'idna.encode'.
  596. A normalized email is considered as :
  597. - having a left part + @ + a right part (the domain can be without '.something')
  598. - having no name before the address. Typically, having no 'Name <>'
  599. Ex:
  600. - Possible Input Email : 'Name <NaMe@DoMaIn.CoM>'
  601. - Normalized Output Email : 'name@domain.com'
  602. :param boolean strict: if True, text should contain a single email
  603. (default behavior in stable 14+). If more than one email is found no
  604. normalized email is returned. If False the first found candidate is used
  605. e.g. if email is 'tony@e.com, "Tony2" <tony2@e.com>', result is either
  606. False (strict=True), either 'tony@e.com' (strict=False).
  607. :return: False if no email found (or if more than 1 email found when being
  608. in strict mode); normalized email otherwise;
  609. """
  610. emails = email_split(text)
  611. if not emails or (strict and len(emails) != 1):
  612. return False
  613. return _normalize_email(emails[0])
  614. def email_normalize_all(text):
  615. """ Tool method allowing to extract email addresses from a text input and returning
  616. normalized version of all found emails. If no email is found, a void list
  617. is returned.
  618. e.g. if email is 'tony@e.com, "Tony2" <tony2@e.com' returned result is ['tony@e.com, tony2@e.com']
  619. :return list: list of normalized emails found in text
  620. """
  621. if not text:
  622. return []
  623. emails = email_split(text)
  624. return list(filter(None, [_normalize_email(email) for email in emails]))
  625. def _normalize_email(email):
  626. """ As of rfc5322 section 3.4.1 local-part is case-sensitive. However most
  627. main providers do consider the local-part as case insensitive. With the
  628. introduction of smtp-utf8 within odoo, this assumption is certain to fall
  629. short for international emails. We now consider that
  630. * if local part is ascii: normalize still 'lower' ;
  631. * else: use as it, SMTP-UF8 is made for non-ascii local parts;
  632. Concerning domain part of the address, as of v14 international domain (IDNA)
  633. are handled fine. The domain is always lowercase, lowering it is fine as it
  634. is probably an error. With the introduction of IDNA, there is an encoding
  635. that allow non-ascii characters to be encoded to ascii ones, using 'idna.encode'.
  636. A normalized email is considered as :
  637. - having a left part + @ + a right part (the domain can be without '.something')
  638. - having no name before the address. Typically, having no 'Name <>'
  639. Ex:
  640. - Possible Input Email : 'Name <NaMe@DoMaIn.CoM>'
  641. - Normalized Output Email : 'name@domain.com'
  642. """
  643. local_part, at, domain = email.rpartition('@')
  644. try:
  645. local_part.encode('ascii')
  646. except UnicodeEncodeError:
  647. pass
  648. else:
  649. local_part = local_part.lower()
  650. return local_part + at + domain.lower()
  651. def email_domain_extract(email):
  652. """ Extract the company domain to be used by IAP services notably. Domain
  653. is extracted from email information e.g:
  654. - info@proximus.be -> proximus.be
  655. """
  656. normalized_email = email_normalize(email)
  657. if normalized_email:
  658. return normalized_email.split('@')[1]
  659. return False
  660. def email_domain_normalize(domain):
  661. """Return the domain normalized or False if the domain is invalid."""
  662. if not domain or '@' in domain:
  663. return False
  664. return domain.lower()
  665. def url_domain_extract(url):
  666. """ Extract the company domain to be used by IAP services notably. Domain
  667. is extracted from an URL e.g:
  668. - www.info.proximus.be -> proximus.be
  669. """
  670. parser_results = urlparse(url)
  671. company_hostname = parser_results.hostname
  672. if company_hostname and '.' in company_hostname:
  673. return '.'.join(company_hostname.split('.')[-2:]) # remove subdomains
  674. return False
  675. def email_escape_char(email_address):
  676. """ Escape problematic characters in the given email address string"""
  677. return email_address.replace('\\', '\\\\').replace('%', '\\%').replace('_', '\\_')
  678. # was mail_thread.decode_header()
  679. def decode_message_header(message, header, separator=' '):
  680. return separator.join(h for h in message.get_all(header, []) if h)
  681. def formataddr(pair, charset='utf-8'):
  682. """Pretty format a 2-tuple of the form (realname, email_address).
  683. If the first element of pair is falsy then only the email address
  684. is returned.
  685. Set the charset to ascii to get a RFC-2822 compliant email. The
  686. realname will be base64 encoded (if necessary) and the domain part
  687. of the email will be punycode encoded (if necessary). The local part
  688. is left unchanged thus require the SMTPUTF8 extension when there are
  689. non-ascii characters.
  690. >>> formataddr(('John Doe', 'johndoe@example.com'))
  691. '"John Doe" <johndoe@example.com>'
  692. >>> formataddr(('', 'johndoe@example.com'))
  693. 'johndoe@example.com'
  694. """
  695. name, address = pair
  696. local, _, domain = address.rpartition('@')
  697. try:
  698. domain.encode(charset)
  699. except UnicodeEncodeError:
  700. # rfc5890 - Internationalized Domain Names for Applications (IDNA)
  701. domain = idna.encode(domain).decode('ascii')
  702. if name:
  703. try:
  704. name.encode(charset)
  705. except UnicodeEncodeError:
  706. # charset mismatch, encode as utf-8/base64
  707. # rfc2047 - MIME Message Header Extensions for Non-ASCII Text
  708. name = base64.b64encode(name.encode('utf-8')).decode('ascii')
  709. return f"=?utf-8?b?{name}?= <{local}@{domain}>"
  710. else:
  711. # ascii name, escape it if needed
  712. # rfc2822 - Internet Message Format
  713. # #section-3.4 - Address Specification
  714. name = email_addr_escapes_re.sub(r'\\\g<0>', name)
  715. return f'"{name}" <{local}@{domain}>'
  716. return f"{local}@{domain}"
  717. def encapsulate_email(old_email, new_email):
  718. """Change the FROM of the message and use the old one as name.
  719. e.g.
  720. * Old From: "Admin" <admin@gmail.com>
  721. * New From: notifications@odoo.com
  722. * Output: "Admin" <notifications@odoo.com>
  723. """
  724. old_email_split = getaddresses([old_email])
  725. if not old_email_split or not old_email_split[0]:
  726. return old_email
  727. new_email_split = getaddresses([new_email])
  728. if not new_email_split or not new_email_split[0]:
  729. return
  730. old_name, old_email = old_email_split[0]
  731. if old_name:
  732. name_part = old_name
  733. else:
  734. name_part = old_email.split("@")[0]
  735. return formataddr((
  736. name_part,
  737. new_email_split[0][1],
  738. ))
  739. def parse_contact_from_email(text):
  740. """ Parse contact name and email (given by text) in order to find contact
  741. information, able to populate records like partners, leads, ...
  742. Supported syntax:
  743. * Raoul <raoul@grosbedon.fr>
  744. * "Raoul le Grand" <raoul@grosbedon.fr>
  745. * Raoul raoul@grosbedon.fr (strange fault tolerant support from
  746. df40926d2a57c101a3e2d221ecfd08fbb4fea30e now supported directly
  747. in 'email_split_tuples';
  748. Otherwise: default, text is set as name.
  749. :return: name, email (normalized if possible)
  750. """
  751. if not text or not text.strip():
  752. return '', ''
  753. split_results = email_split_tuples(text)
  754. name, email = split_results[0] if split_results else ('', '')
  755. if email:
  756. email_normalized = email_normalize(email, strict=False) or email
  757. else:
  758. name, email_normalized = text, ''
  759. return name, email_normalized
  760. def unfold_references(msg_references):
  761. """ As it declared in [RFC2822] long header bodies can be "folded" using
  762. CRLF+WSP. Some mail clients split References header body which contains
  763. Message Ids by "\n ".
  764. RFC2882: https://tools.ietf.org/html/rfc2822#section-2.2.3 """
  765. return [
  766. re.sub(r'[\r\n\t ]+', r'', ref) # "Unfold" buggy references
  767. for ref in mail_header_msgid_re.findall(msg_references)
  768. ]
# 上海开阖软件有限公司 沪ICP备12045867号-1