本站源代码
Você não pode selecionar mais de 25 tópicos Os tópicos devem começar com uma letra ou um número, podem incluir traços ('-') e podem ter até 35 caracteres.

536 linhas
14KB

  1. // Copyright (c) 2014, David Kitchen <david@buro9.com>
  2. //
  3. // All rights reserved.
  4. //
  5. // Redistribution and use in source and binary forms, with or without
  6. // modification, are permitted provided that the following conditions are met:
  7. //
  8. // * Redistributions of source code must retain the above copyright notice, this
  9. // list of conditions and the following disclaimer.
  10. //
  11. // * Redistributions in binary form must reproduce the above copyright notice,
  12. // this list of conditions and the following disclaimer in the documentation
  13. // and/or other materials provided with the distribution.
  14. //
  15. // * Neither the name of the organisation (Microcosm) nor the names of its
  16. // contributors may be used to endorse or promote products derived from
  17. // this software without specific prior written permission.
  18. //
  19. // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
  20. // AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  21. // IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
  22. // DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
  23. // FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  24. // DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
  25. // SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
  26. // CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
  27. // OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  28. // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  29. package bluemonday
  30. import (
  31. "bytes"
  32. "io"
  33. "net/url"
  34. "strings"
  35. "golang.org/x/net/html"
  36. )
  37. // Sanitize takes a string that contains a HTML fragment or document and applies
  38. // the given policy whitelist.
  39. //
  40. // It returns a HTML string that has been sanitized by the policy or an empty
  41. // string if an error has occurred (most likely as a consequence of extremely
  42. // malformed input)
  43. func (p *Policy) Sanitize(s string) string {
  44. if strings.TrimSpace(s) == "" {
  45. return s
  46. }
  47. return p.sanitize(strings.NewReader(s)).String()
  48. }
  49. // SanitizeBytes takes a []byte that contains a HTML fragment or document and applies
  50. // the given policy whitelist.
  51. //
  52. // It returns a []byte containing the HTML that has been sanitized by the policy
  53. // or an empty []byte if an error has occurred (most likely as a consequence of
  54. // extremely malformed input)
  55. func (p *Policy) SanitizeBytes(b []byte) []byte {
  56. if len(bytes.TrimSpace(b)) == 0 {
  57. return b
  58. }
  59. return p.sanitize(bytes.NewReader(b)).Bytes()
  60. }
  61. // SanitizeReader takes an io.Reader that contains a HTML fragment or document
  62. // and applies the given policy whitelist.
  63. //
  64. // It returns a bytes.Buffer containing the HTML that has been sanitized by the
  65. // policy. Errors during sanitization will merely return an empty result.
  66. func (p *Policy) SanitizeReader(r io.Reader) *bytes.Buffer {
  67. return p.sanitize(r)
  68. }
  69. // Performs the actual sanitization process.
  70. func (p *Policy) sanitize(r io.Reader) *bytes.Buffer {
  71. // It is possible that the developer has created the policy via:
  72. // p := bluemonday.Policy{}
  73. // rather than:
  74. // p := bluemonday.NewPolicy()
  75. // If this is the case, and if they haven't yet triggered an action that
  76. // would initiliaze the maps, then we need to do that.
  77. p.init()
  78. var (
  79. buff bytes.Buffer
  80. skipElementContent bool
  81. skippingElementsCount int64
  82. skipClosingTag bool
  83. closingTagToSkipStack []string
  84. mostRecentlyStartedToken string
  85. )
  86. tokenizer := html.NewTokenizer(r)
  87. for {
  88. if tokenizer.Next() == html.ErrorToken {
  89. err := tokenizer.Err()
  90. if err == io.EOF {
  91. // End of input means end of processing
  92. return &buff
  93. }
  94. // Raw tokenizer error
  95. return &bytes.Buffer{}
  96. }
  97. token := tokenizer.Token()
  98. switch token.Type {
  99. case html.DoctypeToken:
  100. if p.allowDocType {
  101. buff.WriteString(token.String())
  102. }
  103. case html.CommentToken:
  104. // Comments are ignored by default
  105. case html.StartTagToken:
  106. mostRecentlyStartedToken = token.Data
  107. aps, ok := p.elsAndAttrs[token.Data]
  108. if !ok {
  109. if _, ok := p.setOfElementsToSkipContent[token.Data]; ok {
  110. skipElementContent = true
  111. skippingElementsCount++
  112. }
  113. if p.addSpaces {
  114. buff.WriteString(" ")
  115. }
  116. break
  117. }
  118. if len(token.Attr) != 0 {
  119. token.Attr = p.sanitizeAttrs(token.Data, token.Attr, aps)
  120. }
  121. if len(token.Attr) == 0 {
  122. if !p.allowNoAttrs(token.Data) {
  123. skipClosingTag = true
  124. closingTagToSkipStack = append(closingTagToSkipStack, token.Data)
  125. if p.addSpaces {
  126. buff.WriteString(" ")
  127. }
  128. break
  129. }
  130. }
  131. if !skipElementContent {
  132. buff.WriteString(token.String())
  133. }
  134. case html.EndTagToken:
  135. if skipClosingTag && closingTagToSkipStack[len(closingTagToSkipStack)-1] == token.Data {
  136. closingTagToSkipStack = closingTagToSkipStack[:len(closingTagToSkipStack)-1]
  137. if len(closingTagToSkipStack) == 0 {
  138. skipClosingTag = false
  139. }
  140. if p.addSpaces {
  141. buff.WriteString(" ")
  142. }
  143. break
  144. }
  145. if _, ok := p.elsAndAttrs[token.Data]; !ok {
  146. if _, ok := p.setOfElementsToSkipContent[token.Data]; ok {
  147. skippingElementsCount--
  148. if skippingElementsCount == 0 {
  149. skipElementContent = false
  150. }
  151. }
  152. if p.addSpaces {
  153. buff.WriteString(" ")
  154. }
  155. break
  156. }
  157. if !skipElementContent {
  158. buff.WriteString(token.String())
  159. }
  160. case html.SelfClosingTagToken:
  161. aps, ok := p.elsAndAttrs[token.Data]
  162. if !ok {
  163. if p.addSpaces {
  164. buff.WriteString(" ")
  165. }
  166. break
  167. }
  168. if len(token.Attr) != 0 {
  169. token.Attr = p.sanitizeAttrs(token.Data, token.Attr, aps)
  170. }
  171. if len(token.Attr) == 0 && !p.allowNoAttrs(token.Data) {
  172. if p.addSpaces {
  173. buff.WriteString(" ")
  174. }
  175. break
  176. }
  177. if !skipElementContent {
  178. buff.WriteString(token.String())
  179. }
  180. case html.TextToken:
  181. if !skipElementContent {
  182. switch strings.ToLower(mostRecentlyStartedToken) {
  183. case "javascript":
  184. // not encouraged, but if a policy allows JavaScript we
  185. // should not HTML escape it as that would break the output
  186. buff.WriteString(token.Data)
  187. case "style":
  188. // not encouraged, but if a policy allows CSS styles we
  189. // should not HTML escape it as that would break the output
  190. buff.WriteString(token.Data)
  191. default:
  192. // HTML escape the text
  193. buff.WriteString(token.String())
  194. }
  195. }
  196. default:
  197. // A token that didn't exist in the html package when we wrote this
  198. return &bytes.Buffer{}
  199. }
  200. }
  201. }
  202. // sanitizeAttrs takes a set of element attribute policies and the global
  203. // attribute policies and applies them to the []html.Attribute returning a set
  204. // of html.Attributes that match the policies
  205. func (p *Policy) sanitizeAttrs(
  206. elementName string,
  207. attrs []html.Attribute,
  208. aps map[string]attrPolicy,
  209. ) []html.Attribute {
  210. if len(attrs) == 0 {
  211. return attrs
  212. }
  213. // Builds a new attribute slice based on the whether the attribute has been
  214. // whitelisted explicitly or globally.
  215. cleanAttrs := []html.Attribute{}
  216. for _, htmlAttr := range attrs {
  217. // Is there an element specific attribute policy that applies?
  218. if ap, ok := aps[htmlAttr.Key]; ok {
  219. if ap.regexp != nil {
  220. if ap.regexp.MatchString(htmlAttr.Val) {
  221. cleanAttrs = append(cleanAttrs, htmlAttr)
  222. continue
  223. }
  224. } else {
  225. cleanAttrs = append(cleanAttrs, htmlAttr)
  226. continue
  227. }
  228. }
  229. // Is there a global attribute policy that applies?
  230. if ap, ok := p.globalAttrs[htmlAttr.Key]; ok {
  231. if ap.regexp != nil {
  232. if ap.regexp.MatchString(htmlAttr.Val) {
  233. cleanAttrs = append(cleanAttrs, htmlAttr)
  234. }
  235. } else {
  236. cleanAttrs = append(cleanAttrs, htmlAttr)
  237. }
  238. }
  239. }
  240. if len(cleanAttrs) == 0 {
  241. // If nothing was allowed, let's get out of here
  242. return cleanAttrs
  243. }
  244. // cleanAttrs now contains the attributes that are permitted
  245. if linkable(elementName) {
  246. if p.requireParseableURLs {
  247. // Ensure URLs are parseable:
  248. // - a.href
  249. // - area.href
  250. // - link.href
  251. // - blockquote.cite
  252. // - q.cite
  253. // - img.src
  254. // - script.src
  255. tmpAttrs := []html.Attribute{}
  256. for _, htmlAttr := range cleanAttrs {
  257. switch elementName {
  258. case "a", "area", "link":
  259. if htmlAttr.Key == "href" {
  260. if u, ok := p.validURL(htmlAttr.Val); ok {
  261. htmlAttr.Val = u
  262. tmpAttrs = append(tmpAttrs, htmlAttr)
  263. }
  264. break
  265. }
  266. tmpAttrs = append(tmpAttrs, htmlAttr)
  267. case "blockquote", "q":
  268. if htmlAttr.Key == "cite" {
  269. if u, ok := p.validURL(htmlAttr.Val); ok {
  270. htmlAttr.Val = u
  271. tmpAttrs = append(tmpAttrs, htmlAttr)
  272. }
  273. break
  274. }
  275. tmpAttrs = append(tmpAttrs, htmlAttr)
  276. case "img", "script":
  277. if htmlAttr.Key == "src" {
  278. if u, ok := p.validURL(htmlAttr.Val); ok {
  279. htmlAttr.Val = u
  280. tmpAttrs = append(tmpAttrs, htmlAttr)
  281. }
  282. break
  283. }
  284. tmpAttrs = append(tmpAttrs, htmlAttr)
  285. default:
  286. tmpAttrs = append(tmpAttrs, htmlAttr)
  287. }
  288. }
  289. cleanAttrs = tmpAttrs
  290. }
  291. if (p.requireNoFollow ||
  292. p.requireNoFollowFullyQualifiedLinks ||
  293. p.addTargetBlankToFullyQualifiedLinks) &&
  294. len(cleanAttrs) > 0 {
  295. // Add rel="nofollow" if a "href" exists
  296. switch elementName {
  297. case "a", "area", "link":
  298. var hrefFound bool
  299. var externalLink bool
  300. for _, htmlAttr := range cleanAttrs {
  301. if htmlAttr.Key == "href" {
  302. hrefFound = true
  303. u, err := url.Parse(htmlAttr.Val)
  304. if err != nil {
  305. continue
  306. }
  307. if u.Host != "" {
  308. externalLink = true
  309. }
  310. continue
  311. }
  312. }
  313. if hrefFound {
  314. var (
  315. noFollowFound bool
  316. targetBlankFound bool
  317. )
  318. addNoFollow := (p.requireNoFollow ||
  319. externalLink && p.requireNoFollowFullyQualifiedLinks)
  320. addTargetBlank := (externalLink &&
  321. p.addTargetBlankToFullyQualifiedLinks)
  322. tmpAttrs := []html.Attribute{}
  323. for _, htmlAttr := range cleanAttrs {
  324. var appended bool
  325. if htmlAttr.Key == "rel" && addNoFollow {
  326. if strings.Contains(htmlAttr.Val, "nofollow") {
  327. noFollowFound = true
  328. tmpAttrs = append(tmpAttrs, htmlAttr)
  329. appended = true
  330. } else {
  331. htmlAttr.Val += " nofollow"
  332. noFollowFound = true
  333. tmpAttrs = append(tmpAttrs, htmlAttr)
  334. appended = true
  335. }
  336. }
  337. if elementName == "a" && htmlAttr.Key == "target" {
  338. if htmlAttr.Val == "_blank" {
  339. targetBlankFound = true
  340. }
  341. if addTargetBlank && !targetBlankFound {
  342. htmlAttr.Val = "_blank"
  343. targetBlankFound = true
  344. tmpAttrs = append(tmpAttrs, htmlAttr)
  345. appended = true
  346. }
  347. }
  348. if !appended {
  349. tmpAttrs = append(tmpAttrs, htmlAttr)
  350. }
  351. }
  352. if noFollowFound || targetBlankFound {
  353. cleanAttrs = tmpAttrs
  354. }
  355. if addNoFollow && !noFollowFound {
  356. rel := html.Attribute{}
  357. rel.Key = "rel"
  358. rel.Val = "nofollow"
  359. cleanAttrs = append(cleanAttrs, rel)
  360. }
  361. if elementName == "a" && addTargetBlank && !targetBlankFound {
  362. rel := html.Attribute{}
  363. rel.Key = "target"
  364. rel.Val = "_blank"
  365. targetBlankFound = true
  366. cleanAttrs = append(cleanAttrs, rel)
  367. }
  368. if targetBlankFound {
  369. // target="_blank" has a security risk that allows the
  370. // opened window/tab to issue JavaScript calls against
  371. // window.opener, which in effect allow the destination
  372. // of the link to control the source:
  373. // https://dev.to/ben/the-targetblank-vulnerability-by-example
  374. //
  375. // To mitigate this risk, we need to add a specific rel
  376. // attribute if it is not already present.
  377. // rel="noopener"
  378. //
  379. // Unfortunately this is processing the rel twice (we
  380. // already looked at it earlier ^^) as we cannot be sure
  381. // of the ordering of the href and rel, and whether we
  382. // have fully satisfied that we need to do this. This
  383. // double processing only happens *if* target="_blank"
  384. // is true.
  385. var noOpenerAdded bool
  386. tmpAttrs := []html.Attribute{}
  387. for _, htmlAttr := range cleanAttrs {
  388. var appended bool
  389. if htmlAttr.Key == "rel" {
  390. if strings.Contains(htmlAttr.Val, "noopener") {
  391. noOpenerAdded = true
  392. tmpAttrs = append(tmpAttrs, htmlAttr)
  393. } else {
  394. htmlAttr.Val += " noopener"
  395. noOpenerAdded = true
  396. tmpAttrs = append(tmpAttrs, htmlAttr)
  397. }
  398. appended = true
  399. }
  400. if !appended {
  401. tmpAttrs = append(tmpAttrs, htmlAttr)
  402. }
  403. }
  404. if noOpenerAdded {
  405. cleanAttrs = tmpAttrs
  406. } else {
  407. // rel attr was not found, or else noopener would
  408. // have been added already
  409. rel := html.Attribute{}
  410. rel.Key = "rel"
  411. rel.Val = "noopener"
  412. cleanAttrs = append(cleanAttrs, rel)
  413. }
  414. }
  415. }
  416. default:
  417. }
  418. }
  419. }
  420. return cleanAttrs
  421. }
  422. func (p *Policy) allowNoAttrs(elementName string) bool {
  423. _, ok := p.setOfElementsAllowedWithoutAttrs[elementName]
  424. return ok
  425. }
  426. func (p *Policy) validURL(rawurl string) (string, bool) {
  427. if p.requireParseableURLs {
  428. // URLs do not contain whitespace
  429. if strings.Contains(rawurl, " ") ||
  430. strings.Contains(rawurl, "\t") ||
  431. strings.Contains(rawurl, "\n") {
  432. return "", false
  433. }
  434. u, err := url.Parse(rawurl)
  435. if err != nil {
  436. return "", false
  437. }
  438. if u.Scheme != "" {
  439. urlPolicy, ok := p.allowURLSchemes[u.Scheme]
  440. if !ok {
  441. return "", false
  442. }
  443. if urlPolicy == nil || urlPolicy(u) == true {
  444. return u.String(), true
  445. }
  446. return "", false
  447. }
  448. if p.allowRelativeURLs {
  449. if u.String() != "" {
  450. return u.String(), true
  451. }
  452. }
  453. return "", false
  454. }
  455. return rawurl, true
  456. }
  457. func linkable(elementName string) bool {
  458. switch elementName {
  459. case "a", "area", "blockquote", "img", "link", "script":
  460. return true
  461. default:
  462. return false
  463. }
  464. }
上海开阖软件有限公司 沪ICP备12045867号-1