sanitize.go

// Copyright (c) 2014, David Kitchen <david@buro9.com>
//
// All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are met:
//
// * Redistributions of source code must retain the above copyright notice, this
//   list of conditions and the following disclaimer.
//
// * Redistributions in binary form must reproduce the above copyright notice,
//   this list of conditions and the following disclaimer in the documentation
//   and/or other materials provided with the distribution.
//
// * Neither the name of the organisation (Microcosm) nor the names of its
//   contributors may be used to endorse or promote products derived from
//   this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
// DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
// FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
// DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
// SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
// CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
// OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

package bluemonday

import (
	"bytes"
	"io"
	"net/url"
	"strings"

	"golang.org/x/net/html"
)

// Sanitize takes a string that contains an HTML fragment or document and
// applies the given policy whitelist.
//
// It returns an HTML string that has been sanitized by the policy, or an empty
// string if an error has occurred (most likely as a consequence of extremely
// malformed input)
func (p *Policy) Sanitize(s string) string {
	if strings.TrimSpace(s) == "" {
		return s
	}

	return p.sanitize(strings.NewReader(s)).String()
}
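
// Illustrative usage sketch (assumes the package's UGCPolicy constructor; the
// exact output depends on the policy in use):
//
//	p := bluemonday.UGCPolicy()
//	out := p.Sanitize(`<a onblur="alert(secret)" href="http://www.google.com">Google</a>`)
//	// out is expected to be: <a href="http://www.google.com" rel="nofollow">Google</a>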

// SanitizeBytes takes a []byte that contains an HTML fragment or document and
// applies the given policy whitelist.
//
// It returns a []byte containing the HTML that has been sanitized by the
// policy, or an empty []byte if an error has occurred (most likely as a
// consequence of extremely malformed input)
func (p *Policy) SanitizeBytes(b []byte) []byte {
	if len(bytes.TrimSpace(b)) == 0 {
		return b
	}

	return p.sanitize(bytes.NewReader(b)).Bytes()
}

// SanitizeReader takes an io.Reader that contains an HTML fragment or document
// and applies the given policy whitelist.
//
// It returns a bytes.Buffer containing the HTML that has been sanitized by the
// policy. Errors during sanitization will merely return an empty result.
func (p *Policy) SanitizeReader(r io.Reader) *bytes.Buffer {
	return p.sanitize(r)
}
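
// Illustrative usage sketch, assuming the caller has a hypothetical file
// "comment.html" to sanitize; the returned buffer can be written anywhere:
//
//	f, err := os.Open("comment.html")
//	if err != nil {
//		log.Fatal(err)
//	}
//	defer f.Close()
//
//	buf := bluemonday.UGCPolicy().SanitizeReader(f)
//	io.Copy(os.Stdout, buf)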

// Performs the actual sanitization process.
func (p *Policy) sanitize(r io.Reader) *bytes.Buffer {

	// It is possible that the developer has created the policy via:
	//   p := bluemonday.Policy{}
	// rather than:
	//   p := bluemonday.NewPolicy()
	// If this is the case, and if they haven't yet triggered an action that
	// would initialize the maps, then we need to do that.
	p.init()

	var (
		buff                     bytes.Buffer
		skipElementContent       bool
		skippingElementsCount    int64
		skipClosingTag           bool
		closingTagToSkipStack    []string
		mostRecentlyStartedToken string
	)

	tokenizer := html.NewTokenizer(r)
	for {
		if tokenizer.Next() == html.ErrorToken {
			err := tokenizer.Err()
			if err == io.EOF {
				// End of input means end of processing
				return &buff
			}

			// Raw tokenizer error
			return &bytes.Buffer{}
		}

		token := tokenizer.Token()
		switch token.Type {
		case html.DoctypeToken:

			if p.allowDocType {
				buff.WriteString(token.String())
			}

		case html.CommentToken:

			// Comments are ignored by default

		case html.StartTagToken:

			mostRecentlyStartedToken = token.Data

			aps, ok := p.elsAndAttrs[token.Data]
			if !ok {
				if _, ok := p.setOfElementsToSkipContent[token.Data]; ok {
					skipElementContent = true
					skippingElementsCount++
				}
				if p.addSpaces {
					buff.WriteString(" ")
				}
				break
			}

			if len(token.Attr) != 0 {
				token.Attr = p.sanitizeAttrs(token.Data, token.Attr, aps)
			}

			if len(token.Attr) == 0 {
				if !p.allowNoAttrs(token.Data) {
					skipClosingTag = true
					closingTagToSkipStack = append(closingTagToSkipStack, token.Data)
					if p.addSpaces {
						buff.WriteString(" ")
					}
					break
				}
			}

			if !skipElementContent {
				buff.WriteString(token.String())
			}

		case html.EndTagToken:

			if skipClosingTag && closingTagToSkipStack[len(closingTagToSkipStack)-1] == token.Data {
				closingTagToSkipStack = closingTagToSkipStack[:len(closingTagToSkipStack)-1]
				if len(closingTagToSkipStack) == 0 {
					skipClosingTag = false
				}
				if p.addSpaces {
					buff.WriteString(" ")
				}
				break
			}

			if _, ok := p.elsAndAttrs[token.Data]; !ok {
				if _, ok := p.setOfElementsToSkipContent[token.Data]; ok {
					skippingElementsCount--
					if skippingElementsCount == 0 {
						skipElementContent = false
					}
				}
				if p.addSpaces {
					buff.WriteString(" ")
				}
				break
			}

			if !skipElementContent {
				buff.WriteString(token.String())
			}

		case html.SelfClosingTagToken:

			aps, ok := p.elsAndAttrs[token.Data]
			if !ok {
				if p.addSpaces {
					buff.WriteString(" ")
				}
				break
			}

			if len(token.Attr) != 0 {
				token.Attr = p.sanitizeAttrs(token.Data, token.Attr, aps)
			}

			if len(token.Attr) == 0 && !p.allowNoAttrs(token.Data) {
				if p.addSpaces {
					buff.WriteString(" ")
				}
				break
			}

			if !skipElementContent {
				buff.WriteString(token.String())
			}

		case html.TextToken:

			if !skipElementContent {
				// mostRecentlyStartedToken holds the tag name of the last
				// opened element, so compare against the element names
				switch strings.ToLower(mostRecentlyStartedToken) {
				case "script":
					// not encouraged, but if a policy allows JavaScript we
					// should not HTML escape it as that would break the output
					buff.WriteString(token.Data)
				case "style":
					// not encouraged, but if a policy allows CSS styles we
					// should not HTML escape it as that would break the output
					buff.WriteString(token.Data)
				default:
					// HTML escape the text
					buff.WriteString(token.String())
				}
			}

		default:
			// A token that didn't exist in the html package when we wrote this
			return &bytes.Buffer{}
		}
	}
}

// sanitizeAttrs takes a set of element attribute policies and the global
// attribute policies and applies them to the []html.Attribute returning a set
// of html.Attributes that match the policies
func (p *Policy) sanitizeAttrs(
	elementName string,
	attrs []html.Attribute,
	aps map[string]attrPolicy,
) []html.Attribute {

	if len(attrs) == 0 {
		return attrs
	}

	// Builds a new attribute slice based on whether the attribute has been
	// whitelisted explicitly or globally.
	cleanAttrs := []html.Attribute{}
	for _, htmlAttr := range attrs {
		// Is there an element specific attribute policy that applies?
		if ap, ok := aps[htmlAttr.Key]; ok {
			if ap.regexp != nil {
				if ap.regexp.MatchString(htmlAttr.Val) {
					cleanAttrs = append(cleanAttrs, htmlAttr)
					continue
				}
			} else {
				cleanAttrs = append(cleanAttrs, htmlAttr)
				continue
			}
		}

		// Is there a global attribute policy that applies?
		if ap, ok := p.globalAttrs[htmlAttr.Key]; ok {
			if ap.regexp != nil {
				if ap.regexp.MatchString(htmlAttr.Val) {
					cleanAttrs = append(cleanAttrs, htmlAttr)
				}
			} else {
				cleanAttrs = append(cleanAttrs, htmlAttr)
			}
		}
	}

	if len(cleanAttrs) == 0 {
		// If nothing was allowed, let's get out of here
		return cleanAttrs
	}
	// cleanAttrs now contains the attributes that are permitted

	if linkable(elementName) {

		if p.requireParseableURLs {
			// Ensure URLs are parseable:
			// - a.href
			// - area.href
			// - link.href
			// - blockquote.cite
			// - q.cite
			// - img.src
			// - script.src
			tmpAttrs := []html.Attribute{}
			for _, htmlAttr := range cleanAttrs {
				switch elementName {
				case "a", "area", "link":
					if htmlAttr.Key == "href" {
						if u, ok := p.validURL(htmlAttr.Val); ok {
							htmlAttr.Val = u
							tmpAttrs = append(tmpAttrs, htmlAttr)
						}
						break
					}
					tmpAttrs = append(tmpAttrs, htmlAttr)
				case "blockquote", "q":
					if htmlAttr.Key == "cite" {
						if u, ok := p.validURL(htmlAttr.Val); ok {
							htmlAttr.Val = u
							tmpAttrs = append(tmpAttrs, htmlAttr)
						}
						break
					}
					tmpAttrs = append(tmpAttrs, htmlAttr)
				case "img", "script":
					if htmlAttr.Key == "src" {
						if u, ok := p.validURL(htmlAttr.Val); ok {
							htmlAttr.Val = u
							tmpAttrs = append(tmpAttrs, htmlAttr)
						}
						break
					}
					tmpAttrs = append(tmpAttrs, htmlAttr)
				default:
					tmpAttrs = append(tmpAttrs, htmlAttr)
				}
			}
			cleanAttrs = tmpAttrs
		}

		if (p.requireNoFollow ||
			p.requireNoFollowFullyQualifiedLinks ||
			p.addTargetBlankToFullyQualifiedLinks) &&
			len(cleanAttrs) > 0 {

			// Add rel="nofollow" if a "href" exists
			switch elementName {
			case "a", "area", "link":
				var hrefFound bool
				var externalLink bool
				for _, htmlAttr := range cleanAttrs {
					if htmlAttr.Key == "href" {
						hrefFound = true

						u, err := url.Parse(htmlAttr.Val)
						if err != nil {
							continue
						}
						if u.Host != "" {
							externalLink = true
						}

						continue
					}
				}

				if hrefFound {
					var (
						noFollowFound    bool
						targetBlankFound bool
					)

					addNoFollow := (p.requireNoFollow ||
						externalLink && p.requireNoFollowFullyQualifiedLinks)

					addTargetBlank := (externalLink &&
						p.addTargetBlankToFullyQualifiedLinks)

					tmpAttrs := []html.Attribute{}
					for _, htmlAttr := range cleanAttrs {

						var appended bool
						if htmlAttr.Key == "rel" && addNoFollow {

							if strings.Contains(htmlAttr.Val, "nofollow") {
								noFollowFound = true
								tmpAttrs = append(tmpAttrs, htmlAttr)
								appended = true
							} else {
								htmlAttr.Val += " nofollow"
								noFollowFound = true
								tmpAttrs = append(tmpAttrs, htmlAttr)
								appended = true
							}
						}

						if elementName == "a" && htmlAttr.Key == "target" {
							if htmlAttr.Val == "_blank" {
								targetBlankFound = true
							}
							if addTargetBlank && !targetBlankFound {
								htmlAttr.Val = "_blank"
								targetBlankFound = true
								tmpAttrs = append(tmpAttrs, htmlAttr)
								appended = true
							}
						}

						if !appended {
							tmpAttrs = append(tmpAttrs, htmlAttr)
						}
					}
					if noFollowFound || targetBlankFound {
						cleanAttrs = tmpAttrs
					}

					if addNoFollow && !noFollowFound {
						rel := html.Attribute{}
						rel.Key = "rel"
						rel.Val = "nofollow"
						cleanAttrs = append(cleanAttrs, rel)
					}

					if elementName == "a" && addTargetBlank && !targetBlankFound {
						rel := html.Attribute{}
						rel.Key = "target"
						rel.Val = "_blank"
						targetBlankFound = true
						cleanAttrs = append(cleanAttrs, rel)
					}

					if targetBlankFound {
						// target="_blank" has a security risk that allows the
						// opened window/tab to issue JavaScript calls against
						// window.opener, which in effect allows the destination
						// of the link to control the source:
						// https://dev.to/ben/the-targetblank-vulnerability-by-example
						//
						// To mitigate this risk, we need to add a specific rel
						// attribute if it is not already present:
						// rel="noopener"
						//
						// Unfortunately this means processing the rel attribute
						// twice (we already looked at it earlier ^^), as we
						// cannot be sure of the ordering of the href and rel
						// attrs, nor whether the work has already been done.
						// This double processing only happens *if*
						// target="_blank" is present.
						var noOpenerAdded bool

						tmpAttrs := []html.Attribute{}
						for _, htmlAttr := range cleanAttrs {
							var appended bool
							if htmlAttr.Key == "rel" {
								if strings.Contains(htmlAttr.Val, "noopener") {
									noOpenerAdded = true
									tmpAttrs = append(tmpAttrs, htmlAttr)
								} else {
									htmlAttr.Val += " noopener"
									noOpenerAdded = true
									tmpAttrs = append(tmpAttrs, htmlAttr)
								}

								appended = true
							}

							if !appended {
								tmpAttrs = append(tmpAttrs, htmlAttr)
							}
						}
						if noOpenerAdded {
							cleanAttrs = tmpAttrs
						} else {
							// rel attr was not found, or else noopener would
							// have been added already
							rel := html.Attribute{}
							rel.Key = "rel"
							rel.Val = "noopener"
							cleanAttrs = append(cleanAttrs, rel)
						}
					}
				}
			default:
			}
		}
	}

	return cleanAttrs
}

// allowNoAttrs reports whether the element may be retained even when the
// sanitized attribute list is empty.
func (p *Policy) allowNoAttrs(elementName string) bool {
	_, ok := p.setOfElementsAllowedWithoutAttrs[elementName]
	return ok
}

// validURL applies the policy's URL rules to a raw URL and returns the
// (possibly normalised) URL along with whether it is acceptable.
func (p *Policy) validURL(rawurl string) (string, bool) {
	if p.requireParseableURLs {
		// URLs do not contain whitespace
		if strings.Contains(rawurl, " ") ||
			strings.Contains(rawurl, "\t") ||
			strings.Contains(rawurl, "\n") {
			return "", false
		}

		u, err := url.Parse(rawurl)
		if err != nil {
			return "", false
		}

		if u.Scheme != "" {
			urlPolicy, ok := p.allowURLSchemes[u.Scheme]
			if !ok {
				return "", false
			}

			if urlPolicy == nil || urlPolicy(u) {
				return u.String(), true
			}

			return "", false
		}

		if p.allowRelativeURLs {
			if u.String() != "" {
				return u.String(), true
			}
		}

		return "", false
	}

	return rawurl, true
}

// linkable reports whether the element carries URL-bearing attributes
// (href, cite or src) and therefore needs its URLs validated.
func linkable(elementName string) bool {
	switch elementName {
	case "a", "area", "blockquote", "img", "link", "q", "script":
		return true
	default:
		return false
	}
}