comb.go 3.2 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115
  1. //
  2. // Copyright (C) 2017-2021 Marcus Rohrmoser, http://purl.mro.name/ShaarliGo
  3. //
  4. // This program is free software: you can redistribute it and/or modify
  5. // it under the terms of the GNU General Public License as published by
  6. // the Free Software Foundation, either version 3 of the License, or
  7. // (at your option) any later version.
  8. //
  9. // This program is distributed in the hope that it will be useful,
  10. // but WITHOUT ANY WARRANTY; without even the implied warranty of
  11. // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  12. // GNU General Public License for more details.
  13. //
  14. // You should have received a copy of the GNU General Public License
  15. // along with this program. If not, see <http://www.gnu.org/licenses/>.
  16. //
  17. package main
  18. import (
  19. "io"
  20. "net/url"
  21. "strings"
  22. "time"
  23. "github.com/yhat/scrape"
  24. "golang.org/x/net/html"
  25. "golang.org/x/net/html/atom"
  26. )
  27. var serverLocation *time.Location
  28. func init() {
  29. // TODO rather use app settings?
  30. serverLocation, _ = time.LoadLocation("Europe/Berlin")
  31. }
  32. func entryFromURL(ur *url.URL, timeout time.Duration) (Entry, error) {
  33. if r, err := HttpGetBody(ur, timeout); err != nil {
  34. return Entry{}, err
  35. } else {
  36. return entryFromReader(r, ur)
  37. }
  38. }
  39. func entryFromReader(r io.Reader, ur *url.URL) (Entry, error) {
  40. if root, err := html.Parse(r); err != nil {
  41. return Entry{}, err
  42. } else {
  43. return entryFromNode(root, ur)
  44. }
  45. }
// entryFromNode scrapes bibliographic metadata — language, title,
// summary, authors, publication date, keywords and a thumbnail — from a
// parsed HTML document into an Entry.
//
// ur is accepted for symmetry with the callers but is not used in the
// body. The error result is always nil.
func entryFromNode(root *html.Node, ur *url.URL) (Entry, error) {
	ret := Entry{}
	// Document language: take the lang attribute of the top-level <html>
	// element (a direct child of the document root). The break makes this
	// first-match-only.
	for _, node := range scrape.FindAll(root, func(n *html.Node) bool {
		return root == n.Parent && html.ElementNode == n.Type && atom.Html == n.DataAtom
	}) {
		ret.XmlLang = Lang(scrape.Attr(node, "lang"))
		break
	}
	// Visit every <meta> and <title> element in document order. Because
	// each match assigns into ret, a later element overwrites an earlier
	// one (e.g. og:title beats <title> when it appears later) — except
	// for the thumbnail, which is explicitly first-wins via the nil guard
	// below. Do not reorder the cases: their order decides precedence
	// when one element matches several patterns.
	for _, node := range scrape.FindAll(root, func(n *html.Node) bool {
		return html.ElementNode == n.Type && (atom.Meta == n.DataAtom || atom.Title == n.DataAtom)
	}) {
		// For <title> these attributes are empty; for <meta> either
		// name= or property= identifies the field and content= holds it.
		strName := scrape.Attr(node, "name")
		strProp := scrape.Attr(node, "property")
		strContent := scrape.Attr(node, "content")
		switch {
		case atom.Title == node.DataAtom:
			// <title>…</title>: the text content is the title.
			ret.Title = HumanText{Body: scrape.Text(node)}
		case "title" == strName:
			ret.Title = HumanText{Body: strContent}
		case "description" == strName:
			ret.Summary = &HumanText{Body: strContent}
		case "author" == strName:
			ret.Authors = append(ret.Authors, Person{Name: strContent})
		case "date" == strName:
			// Try three layouts, most to least specific: strict RFC 3339,
			// then RFC 3339 without the colon in the offset, then an
			// offset-less local time interpreted in serverLocation.
			var t time.Time
			var err error
			if t, err = time.Parse(time.RFC3339, strContent); err != nil {
				if t, err = time.ParseInLocation("2006-01-02T15:04:05Z0700", strContent, serverLocation); err != nil {
					if t, err = time.ParseInLocation("2006-01-02T15:04:05", strContent, serverLocation); err != nil {
						// Unparseable dates are deliberately ignored
						// (best effort): Published simply stays unset.
						//panic(err)
					}
				}
			}
			if err == nil {
				ret.Published = iso8601(t)
			}
		case "keywords" == strName:
			// Comma-separated keywords become categories; whitespace is
			// trimmed, inner spaces become underscores, empties dropped.
			for _, txt := range strings.Split(strContent, ",") {
				if t := strings.Replace(strings.TrimSpace(txt), " ", "_", -1); "" != t {
					ret.Categories = append(ret.Categories, Category{Term: t})
				}
			}
		case "og:title" == strProp:
			ret.Title = HumanText{Body: strContent}
		case "og:description" == strProp:
			ret.Summary = &HumanText{Body: strContent}
		case nil == ret.MediaThumbnail && "og:image" == strProp:
			// First og:image only — the nil check keeps later ones from
			// replacing it.
			ret.MediaThumbnail = &MediaThumbnail{Url: Iri(strContent)}
		}
	}
	return ret, nil
}