// Copyright (c) 2016-2017 Marcus Rohrmoser, http://purl.mro.name/recorder
//
// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and
// associated documentation files (the "Software"), to deal in the Software without restriction,
// including without limitation the rights to use, copy, modify, merge, publish, distribute,
// sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is
// furnished to do so, subject to the following conditions:
//
// The above copyright notice and this permission notice shall be included in all copies or
// substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT
// NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
// NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES
// OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
// CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
//
// MIT License http://opensource.org/licenses/MIT
// HTML helpers.
//
// import "purl.mro.name/recorder/radio/scrape"
  22. package scrape
  23. import (
  24. "regexp"
  25. "strings"
  26. "unicode"
  27. "github.com/yhat/scrape"
  28. "golang.org/x/net/html"
  29. "golang.org/x/net/html/atom"
  30. )
  31. const lineFeedMarker = "55855D6B-4E49-4B83-BE50-082ECB380AB1"
  32. func TextWithBrFromNodeSet(nodes []*html.Node) string {
  33. parts := make([]string, len(nodes))
  34. for i, node := range nodes {
  35. for _, tag := range []atom.Atom{atom.Br, atom.Tr} {
  36. for _, n := range scrape.FindAll(node, func(n *html.Node) bool { return tag == n.DataAtom }) {
  37. lfn := html.Node{Type: html.TextNode, Data: lineFeedMarker}
  38. n.Parent.InsertBefore(&lfn, n.NextSibling)
  39. }
  40. }
  41. for _, tag := range []atom.Atom{atom.P, atom.Div} {
  42. for _, n := range scrape.FindAll(node, func(n *html.Node) bool { return tag == n.DataAtom }) {
  43. lfn := html.Node{Type: html.TextNode, Data: lineFeedMarker + lineFeedMarker}
  44. n.Parent.InsertBefore(&lfn, n.NextSibling)
  45. }
  46. }
  47. tmp := []string{}
  48. for _, n := range scrape.FindAll(node, func(n *html.Node) bool { return html.TextNode == n.Type }) {
  49. tmp = append(tmp, n.Data)
  50. }
  51. parts[i] = strings.Join(tmp, "")
  52. }
  53. ret := strings.Join(parts, lineFeedMarker+lineFeedMarker)
  54. ret = NormaliseWhiteSpace(ret)
  55. ret = strings.Replace(ret, lineFeedMarker, "\n", -1)
  56. re := regexp.MustCompile("[ ]*(\\s)[ ]*") // collapse whitespace, keep \n
  57. ret = re.ReplaceAllString(ret, "$1") // collapse whitespace (not the \n\n however)
  58. {
  59. re := regexp.MustCompile("\\s*\\n\\s*\\n\\s*") // collapse linefeeds
  60. ret = re.ReplaceAllString(ret, "\n\n")
  61. }
  62. return strings.TrimSpace(ret)
  63. }
  64. func TextChildrenNoClimb(node *html.Node) string {
  65. ret := []string{}
  66. for n := node.FirstChild; nil != n; n = n.NextSibling {
  67. if html.TextNode != n.Type {
  68. continue
  69. }
  70. ret = append(ret, strings.TrimSpace(n.Data))
  71. }
  72. return strings.Join(ret, "")
  73. }
  74. func NormaliseWhiteSpace(s string) string {
  75. return strings.Map(func(r rune) rune {
  76. if unicode.IsSpace(r) {
  77. return rune(32)
  78. }
  79. return r
  80. }, s)
  81. }