atom.go 14 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448
  1. //
  2. // Copyright (C) 2017-2021 Marcus Rohrmoser, http://purl.mro.name/ShaarliGo
  3. //
  4. // This program is free software: you can redistribute it and/or modify
  5. // it under the terms of the GNU General Public License as published by
  6. // the Free Software Foundation, either version 3 of the License, or
  7. // (at your option) any later version.
  8. //
  9. // This program is distributed in the hope that it will be useful,
  10. // but WITHOUT ANY WARRANTY; without even the implied warranty of
  11. // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  12. // GNU General Public License for more details.
  13. //
  14. // You should have received a copy of the GNU General Public License
  15. // along with this program. If not, see <http://www.gnu.org/licenses/>.
  16. //
  17. package main
  18. import (
  19. "encoding/base64"
  20. "encoding/binary"
  21. "encoding/xml"
  22. "errors"
  23. "fmt"
  24. "io"
  25. "os"
  26. "sort"
  27. "strconv"
  28. "strings"
  29. "time"
  30. // "golang.org/x/tools/blog/atom"
  31. "github.com/yhat/scrape"
  32. "golang.org/x/net/html"
  33. )
  34. const lengthyAtomPreambleComment string = `
  35. https://developer.mozilla.org/en/docs/XSL_Transformations_in_Mozilla_FAQ#Why_isn.27t_my_stylesheet_applied.3F
  36. Caution! Firefox ignores the XSLT if the XML looks like a RSS or Atom feed.
  37. So add a comment to the XML file to push the <fEEd or <rsS tag out of the first
  38. 512 bytes looked at by Firefox to guess whether it is a feed or not.
  39. https://bugzilla.mozilla.org/show_bug.cgi?id=338621#c72
  40. `
  41. const atomNamespace = "http://www.w3.org/2005/Atom"
  42. func FeedFromFileName(file string) (Feed, error) {
  43. if read, err := os.Open(file); nil == read || nil != err {
  44. return Feed{}, err
  45. } else {
  46. defer read.Close()
  47. return FeedFromReader(read)
  48. }
  49. }
  50. func FeedFromReader(file io.Reader) (Feed, error) {
  51. ret := Feed{}
  52. err := xml.NewDecoder(file).Decode(&ret)
  53. return ret, err
  54. }
  55. type Iri string // https://tools.ietf.org/html/rfc3987
  56. type Id Iri // we allow relative Ids (in persistent store)
  57. type Lang string // https://tools.ietf.org/html/rfc3066
  58. type Relation string // https://www.iana.org/assignments/link-relations/link-relations.xhtml#link-relations-1
  59. type MimeType string // https://tools.ietf.org/html/rfc2045#section-5.1
  60. type TextType string // https://tools.ietf.org/html/rfc4287#section-4.1.3.1
  61. // https://mro.github.io/atomenabled.org/
  62. // https://tools.ietf.org/html/rfc4287#section-4.1.1
  63. //
  64. // see also https://godoc.org/golang.org/x/tools/blog/atom#Feed
  65. type Feed struct {
  66. XMLName xml.Name `xml:"http://www.w3.org/2005/Atom feed"`
  67. XmlBase Iri `xml:"xml:base,attr,omitempty"`
  68. XmlLang Lang `xml:"xml:lang,attr,omitempty"`
  69. XmlNSShaarliGo string `xml:"xmlns:sg,attr,omitempty"` // https://github.com/golang/go/issues/9519#issuecomment-252196382
  70. SearchTerms string `xml:"sg:searchTerms,attr,omitempty"` // rather use http://www.opensearch.org/Specifications/OpenSearch/1.1#Example_of_OpenSearch_response_elements_in_Atom_1.0
  71. XmlNSOpenSearch string `xml:"xmlns:opensearch,attr,omitempty"` // https://github.com/golang/go/issues/9519#issuecomment-252196382
  72. Query string `xml:"opensearch:Query,omitempty"` // http://www.opensearch.org/Specifications/OpenSearch/1.1#Example_of_OpenSearch_response_elements_in_Atom_1.0
  73. Title HumanText `xml:"title"`
  74. Subtitle *HumanText `xml:"subtitle,omitempty"`
  75. Id Id `xml:"id"`
  76. Updated iso8601 `xml:"updated"`
  77. Generator *Generator `xml:"generator,omitempty"`
  78. Icon Iri `xml:"icon,omitempty"`
  79. Logo Iri `xml:"logo,omitempty"`
  80. Links []Link `xml:"link"`
  81. Categories []Category `xml:"category"`
  82. Authors []Person `xml:"author"`
  83. Contributors []Person `xml:"contributor"`
  84. Rights *HumanText `xml:"rights,omitempty"`
  85. Entries []*Entry `xml:"entry"`
  86. }
  87. type Generator struct {
  88. Uri Iri `xml:"uri,attr"`
  89. Version string `xml:"version,attr,omitempty"`
  90. Body string `xml:",chardata"`
  91. }
  92. // http://stackoverflow.com/a/25015260
  93. type iso8601 time.Time
  94. func (v iso8601) IsZero() bool { return time.Time(v).IsZero() }
  95. func (a iso8601) After(b iso8601) bool { return time.Time(a).After(time.Time(b)) }
  96. func (a iso8601) Before(b iso8601) bool { return time.Time(a).Before(time.Time(b)) }
  97. func (a iso8601) Format(fmt string) string { return time.Time(a).Format(fmt) }
  98. func (v iso8601) MarshalXML(e *xml.Encoder, start xml.StartElement) error {
  99. e.EncodeElement(v.Format(time.RFC3339), start)
  100. return nil
  101. }
  102. func (c *iso8601) UnmarshalXML(d *xml.Decoder, start xml.StartElement) error {
  103. var v string
  104. d.DecodeElement(&v, &start)
  105. if parse, err := time.Parse(time.RFC3339, v); err != nil {
  106. return err
  107. } else {
  108. *c = iso8601(parse)
  109. return nil
  110. }
  111. }
  112. // see also https://godoc.org/golang.org/x/tools/blog/atom#Link
  113. type Link struct {
  114. Href string `xml:"href,attr"`
  115. Rel Relation `xml:"rel,attr,omitempty"`
  116. Type MimeType `xml:"type,attr,omitempty"`
  117. HrefLang Lang `xml:"hreflang,attr,omitempty"`
  118. Title string `xml:"title,attr,omitempty"`
  119. Length int64 `xml:"length,attr,omitempty"`
  120. }
  121. // see also https://godoc.org/golang.org/x/tools/blog/atom#Person
  122. type Person struct {
  123. Name string `xml:"name"`
  124. Email string `xml:"email,omitempty"`
  125. Uri Iri `xml:"uri,omitempty"`
  126. }
  127. // see also https://godoc.org/golang.org/x/tools/blog/atom#Entry
  128. type Entry struct {
  129. XMLName xml.Name `xml:"http://www.w3.org/2005/Atom entry,omitempty"`
  130. XmlBase Iri `xml:"xml:base,attr,omitempty"`
  131. XmlLang Lang `xml:"xml:lang,attr,omitempty"`
  132. Title HumanText `xml:"title"`
  133. Summary *HumanText `xml:"summary,omitempty"`
  134. Id Id `xml:"id"`
  135. Updated iso8601 `xml:"updated"`
  136. Published iso8601 `xml:"published,omitempty"`
  137. Links []Link `xml:"link"`
  138. Categories []Category `xml:"category"`
  139. Authors []Person `xml:"author"`
  140. Contributors []Person `xml:"contributor"`
  141. Content *HumanText `xml:"content"`
  142. // Vorsicht! beim Schreiben (Marshal/Encode) fuchst's noch: https://github.com/golang/go/issues/9519#issuecomment-252196382
  143. MediaThumbnail *MediaThumbnail `xml:"http://search.yahoo.com/mrss/ thumbnail,omitempty"`
  144. GeoRssPoint *GeoRssPoint `xml:"http://www.georss.org/georss point,omitempty"`
  145. }
  146. type HumanText struct {
  147. XmlLang Lang `xml:"xml:lang,attr,omitempty"`
  148. Body string `xml:",chardata"`
  149. Type TextType `xml:"type,attr,omitempty"`
  150. Src Iri `xml:"src,attr,omitempty"`
  151. }
  152. type Category struct {
  153. Term string `xml:"term,attr"`
  154. Scheme Iri `xml:"scheme,attr,omitempty"`
  155. Label string `xml:"label,attr,omitempty"`
  156. }
  157. type MediaThumbnail struct {
  158. Url Iri `xml:"url,attr"`
  159. }
  160. type Latitude float32
  161. type Longitude float32
  162. type GeoRssPoint struct {
  163. Lat Latitude
  164. Lon Longitude
  165. }
  166. func (v GeoRssPoint) MarshalXML(e *xml.Encoder, start xml.StartElement) error {
  167. e.EncodeElement(fmt.Sprintf("%f %f", v.Lat, v.Lon), start)
  168. return nil
  169. }
  170. func (c *GeoRssPoint) UnmarshalXML(d *xml.Decoder, start xml.StartElement) error {
  171. var v string
  172. d.DecodeElement(&v, &start)
  173. res := strings.SplitN(v, " ", 2)
  174. if len(res) != 2 {
  175. return errors.New("Not a proper 'lat lon' pair.")
  176. }
  177. lat, err := strconv.ParseFloat(res[0], 32)
  178. if err != nil {
  179. return err
  180. }
  181. lon, err := strconv.ParseFloat(res[1], 32)
  182. if err != nil {
  183. return err
  184. }
  185. *c = GeoRssPoint{Lat: Latitude(lat), Lon: Longitude(lon)}
  186. return nil
  187. }
  188. func xmlEncodeWithXslt(e interface{}, hrefXslt string, enc *xml.Encoder) error {
  189. comm := func() error {
  190. var err error
  191. if _, ok := e.(Feed); ok {
  192. // write the comment only for feeds
  193. if err = enc.EncodeToken(xml.Comment(lengthyAtomPreambleComment)); err == nil {
  194. err = enc.EncodeToken(xml.CharData("\n"))
  195. }
  196. }
  197. return err
  198. }
  199. var err error
  200. // preamble
  201. if err = enc.EncodeToken(xml.ProcInst{Target: "xml", Inst: []byte(`version="1.0" encoding="UTF-8"`)}); err == nil {
  202. if err = enc.EncodeToken(xml.CharData("\n")); err == nil {
  203. if err = enc.EncodeToken(xml.ProcInst{Target: "xml-stylesheet", Inst: []byte("type='text/xsl' href='" + hrefXslt + "'")}); err == nil {
  204. if err = enc.EncodeToken(xml.CharData("\n")); err == nil {
  205. if err = comm(); err == nil {
  206. if err = enc.Encode(e); err == nil {
  207. err = enc.EncodeToken(xml.CharData("\n"))
  208. }
  209. }
  210. }
  211. }
  212. }
  213. }
  214. return err
  215. }
  216. func (feed *Feed) Append(e *Entry) (*Entry, error) {
  217. if err := e.Validate(); err != nil {
  218. return nil, err
  219. }
  220. // todo: pre-check uniqueness of Id
  221. feed.Entries = append(feed.Entries, e)
  222. return e, nil
  223. }
  224. // sort.Interface
  225. type ByPublishedDesc []*Entry
  226. func (a ByPublishedDesc) Len() int { return len(a) }
  227. func (a ByPublishedDesc) Swap(i, j int) { a[i], a[j] = a[j], a[i] }
  228. func (a ByPublishedDesc) Less(i, j int) bool { return !a[i].Published.Before(a[j].Published) }
  229. type ByUpdatedDesc []*Entry
  230. func (a ByUpdatedDesc) Len() int { return len(a) }
  231. func (a ByUpdatedDesc) Swap(i, j int) { a[i], a[j] = a[j], a[i] }
  232. func (a ByUpdatedDesc) Less(i, j int) bool { return !a[i].Updated.Before(a[j].Updated) }
  233. // custom interface
  234. // sufficient for 32 bit.
  235. func base64ToBase24x7(b64 string) (string, error) {
  236. if data, err := base64.RawURLEncoding.DecodeString(b64); err != nil {
  237. return "", err
  238. } else {
  239. // check len(data) ?
  240. ui32 := binary.LittleEndian.Uint32(data)
  241. base24 := fmt.Sprintf("%07s", strconv.FormatUint(uint64(ui32), 24))
  242. return strings.Map(mapBase24ToSuperCareful, base24), nil
  243. }
  244. }
  245. // Being "super-careful" https://code.mro.name/mro/ProgrammableWebSwartz2013/src/master/content/pages/2-building-for-users.md
  246. //
  247. // 0123456789abcdefghijklmn ->
  248. // 23456789abcdefghkrstuxyz
  249. func mapBase24ToSuperCareful(r rune) rune {
  250. digits := []rune("23456789abcdefghkrstuxyz")
  251. switch {
  252. case '0' <= r && r <= '9':
  253. return digits[:10][r-'0']
  254. case r >= 'a' && r <= 'n':
  255. return digits[10:][r-'a']
  256. }
  257. panic("ouch")
  258. }
  259. func newRandomId(t time.Time) Id {
  260. ui32 := uint32(t.Unix() & 0xFFFFFFFF) // unix time in seconds as uint32
  261. base24 := fmt.Sprintf("%07s", strconv.FormatUint(uint64(ui32), 24))
  262. return Id(strings.Map(mapBase24ToSuperCareful, base24))
  263. }
  264. func (feed Feed) newUniqueId(t time.Time) Id {
  265. id := newRandomId(t)
  266. for _, entry := range feed.Entries {
  267. if entry.Id == id {
  268. panic("id not unique")
  269. }
  270. }
  271. return id
  272. }
  273. func (feed Feed) newEntry(t time.Time) *Entry {
  274. defer un(trace("Feed.newEntry(t)"))
  275. return &Entry{
  276. Authors: feed.Authors,
  277. Published: iso8601(t),
  278. Id: feed.newUniqueId(t),
  279. }
  280. }
  281. func (feed *Feed) findEntry(doesMatch func(*Entry) bool) (int, *Entry) {
  282. defer un(trace(strings.Join([]string{"Feed.findEntry(f(*Entry))"}, "")))
  283. if nil != doesMatch {
  284. for idx, entry := range feed.Entries {
  285. if doesMatch(entry) {
  286. return idx, entry
  287. }
  288. }
  289. }
  290. return -1, nil
  291. }
  292. func (feed *Feed) findEntryById(id Id) (int, *Entry) {
  293. defer un(trace(strings.Join([]string{"Feed.findEntryById('", string(id), "')"}, "")))
  294. if "" != id {
  295. return feed.findEntry(func(entry *Entry) bool { return id == entry.Id })
  296. }
  297. return feed.findEntry(nil)
  298. }
  299. func (feed *Feed) deleteEntryById(id Id) *Entry {
  300. if i, entry := feed.findEntryById(id); i < 0 {
  301. return nil
  302. } else {
  303. a := feed.Entries
  304. // https://github.com/golang/go/wiki/SliceTricks
  305. copy(a[i:], a[i+1:])
  306. // a[len(a)-1] = nil // or the zero value of T
  307. feed.Entries = a[:len(a)-1]
  308. feed.Updated = iso8601(time.Now())
  309. // don' try to be smart. When removing days feeds, we rely on correct Published date.
  310. // entry.Published = iso8601{time.Time{}}
  311. // entry.Updated = entry.Published
  312. return entry
  313. }
  314. }
  315. func (feed Feed) SaveToFile(dst string) error {
  316. defer un(trace("Feed.SaveToFile"))
  317. sort.Sort(ByPublishedDesc(feed.Entries))
  318. // remove deleted entries? Maybe Published date zero.
  319. tmp := dst + "~"
  320. var err error
  321. var w *os.File
  322. if w, err = os.Create(tmp); err == nil {
  323. enc := xml.NewEncoder(w)
  324. enc.Indent("", " ")
  325. if err = enc.Encode(feed); err == nil {
  326. if err = enc.Flush(); err == nil {
  327. if err = w.Close(); err == nil {
  328. if err = os.Rename(dst, dst+".bak"); err == nil || os.IsNotExist(err) {
  329. if err = os.Rename(tmp, dst); err == nil {
  330. return nil
  331. }
  332. }
  333. }
  334. }
  335. }
  336. }
  337. return err
  338. }
  339. // Validate for storage
  340. func (entry *Entry) Validate() error {
  341. if "" == entry.Id {
  342. return errors.New("Entry may not have empty Id.")
  343. }
  344. if 1 < len(entry.Links) {
  345. return fmt.Errorf("Entry may not have more than one link. Entry.Id='%s'", entry.Id)
  346. }
  347. if 1 == len(entry.Links) {
  348. if "" == entry.Links[0].Href {
  349. return fmt.Errorf("Entry may not have empty link. Entry.Id='%s'", entry.Id)
  350. }
  351. url := mustParseURL(entry.Links[0].Href)
  352. if !url.IsAbs() {
  353. return fmt.Errorf("Entry must have absolute Link. Entry.Id='%s'", entry.Id)
  354. }
  355. if "" == url.Host {
  356. return fmt.Errorf("Entry must have Link with non-empty host. Entry.Id='%s'", entry.Id)
  357. }
  358. }
  359. return nil
  360. }
  361. func AggregateCategories(entries []*Entry) []Category {
  362. // aggregate & count feed entry categories
  363. cats := make(map[string]int, 1*len(entries)) // raw len guess
  364. for _, ent := range entries {
  365. for _, cat := range ent.Categories {
  366. cats[cat.Term] += 1
  367. }
  368. }
  369. cs := make([]Category, 0, len(cats))
  370. for term, count := range cats {
  371. if term != "" && count != 0 {
  372. cs = append(cs, Category{Term: term, Label: strconv.Itoa(count)})
  373. }
  374. }
  375. sort.Slice(cs, func(i, j int) bool {
  376. return strings.Compare(cs[i].Term, cs[j].Term) < 0
  377. })
  378. return cs
  379. }
  380. func (ht HumanText) Categories() []Category {
  381. ret := make([]Category, 0, 10)
  382. for _, t := range tagsFromString(ht.Body) {
  383. ret = append(ret, Category{Term: t})
  384. }
  385. return ret
  386. }
  387. const iWillBeALineFeedMarker = "+,zX@D4X#%`lGdX-vWU?/==v"
  388. func cleanLegacyContent(txt string) string {
  389. src := strings.Replace(txt, "<br />", iWillBeALineFeedMarker, -1)
  390. if node, err := html.Parse(strings.NewReader(src)); err == nil {
  391. str := strings.Replace(scrape.Text(node), iWillBeALineFeedMarker, "", -1)
  392. return strings.Trim(str[:len(str)-len("( Permalink )")], " ")
  393. } else {
  394. return err.Error()
  395. }
  396. }