comb.go 3.2 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115
  1. //
  2. // Copyright (C) 2017-2021 Marcus Rohrmoser, http://purl.mro.name/ShaarliGo
  3. //
  4. // This program is free software: you can redistribute it and/or modify
  5. // it under the terms of the GNU General Public License as published by
  6. // the Free Software Foundation, either version 3 of the License, or
  7. // (at your option) any later version.
  8. //
  9. // This program is distributed in the hope that it will be useful,
  10. // but WITHOUT ANY WARRANTY; without even the implied warranty of
  11. // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  12. // GNU General Public License for more details.
  13. //
  14. // You should have received a copy of the GNU General Public License
  15. // along with this program. If not, see <http://www.gnu.org/licenses/>.
  16. //
  17. package main
  18. import (
  19. "io"
  20. "net/url"
  21. "strings"
  22. "time"
  23. "github.com/yhat/scrape"
  24. "golang.org/x/net/html"
  25. "golang.org/x/net/html/atom"
  26. )
  27. var serverLocation *time.Location
  28. func init() {
  29. // TODO rather use app settings?
  30. serverLocation, _ = time.LoadLocation("Europe/Berlin")
  31. }
  32. func entryFromURL(ur *url.URL, timeout time.Duration) (Entry, error) {
  33. if r, err := HttpGetBody(ur, timeout); err != nil {
  34. return Entry{}, err
  35. } else {
  36. return entryFromReader(r, ur)
  37. }
  38. }
  39. func entryFromReader(r io.Reader, ur *url.URL) (Entry, error) {
  40. if root, err := html.Parse(r); err != nil {
  41. return Entry{}, err
  42. } else {
  43. return entryFromNode(root, ur)
  44. }
  45. }
// entryFromNode scrapes bibliographic metadata — language, title,
// summary, authors, publication date, keywords and a thumbnail — from a
// parsed HTML document into an Entry.
//
// ur is accepted for symmetry with the callers but is not used in the
// body. The error result is always nil.
func entryFromNode(root *html.Node, ur *url.URL) (Entry, error) {
	ret := Entry{}
	// Document language: take the lang attribute of the top-level <html>
	// element (a direct child of the document root). The break makes this
	// first-match-only.
	for _, node := range scrape.FindAll(root, func(n *html.Node) bool {
		return root == n.Parent && html.ElementNode == n.Type && atom.Html == n.DataAtom
	}) {
		ret.XmlLang = Lang(scrape.Attr(node, "lang"))
		break
	}
	// Visit every <meta> and <title> element in document order. Because
	// each match assigns into ret, a later element overwrites an earlier
	// one (e.g. og:title beats <title> when it appears later) — except
	// for the thumbnail, which is explicitly first-wins via the nil guard
	// below. Do not reorder the cases: their order decides precedence
	// when one element matches several patterns.
	for _, node := range scrape.FindAll(root, func(n *html.Node) bool {
		return html.ElementNode == n.Type && (atom.Meta == n.DataAtom || atom.Title == n.DataAtom)
	}) {
		// For <title> these attributes are empty; for <meta> either
		// name= or property= identifies the field and content= holds it.
		strName := scrape.Attr(node, "name")
		strProp := scrape.Attr(node, "property")
		strContent := scrape.Attr(node, "content")
		switch {
		case atom.Title == node.DataAtom:
			// <title>…</title>: the text content is the title.
			ret.Title = HumanText{Body: scrape.Text(node)}
		case "title" == strName:
			ret.Title = HumanText{Body: strContent}
		case "description" == strName:
			ret.Summary = &HumanText{Body: strContent}
		case "author" == strName:
			ret.Authors = append(ret.Authors, Person{Name: strContent})
		case "date" == strName:
			// Try three layouts, most to least specific: strict RFC 3339,
			// then RFC 3339 without the colon in the offset, then an
			// offset-less local time interpreted in serverLocation.
			var t time.Time
			var err error
			if t, err = time.Parse(time.RFC3339, strContent); err != nil {
				if t, err = time.ParseInLocation("2006-01-02T15:04:05Z0700", strContent, serverLocation); err != nil {
					if t, err = time.ParseInLocation("2006-01-02T15:04:05", strContent, serverLocation); err != nil {
						// Unparseable dates are deliberately ignored
						// (best effort): Published simply stays unset.
						//panic(err)
					}
				}
			}
			if err == nil {
				ret.Published = iso8601(t)
			}
		case "keywords" == strName:
			// Comma-separated keywords become categories; whitespace is
			// trimmed, inner spaces become underscores, empties dropped.
			for _, txt := range strings.Split(strContent, ",") {
				if t := strings.Replace(strings.TrimSpace(txt), " ", "_", -1); "" != t {
					ret.Categories = append(ret.Categories, Category{Term: t})
				}
			}
		case "og:title" == strProp:
			ret.Title = HumanText{Body: strContent}
		case "og:description" == strProp:
			ret.Summary = &HumanText{Body: strContent}
		case nil == ret.MediaThumbnail && "og:image" == strProp:
			// First og:image only — the nil check keeps later ones from
			// replacing it.
			ret.MediaThumbnail = &MediaThumbnail{Url: Iri(strContent)}
		}
	}
	return ret, nil
}