12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576 |
- """
- Digg (News, Social media)
- @website https://digg.com/
- @provide-api no
- @using-api no
- @results HTML (using search portal)
- @stable no (HTML can change)
- @parse url, title, content, publishedDate, thumbnail
- """
- from urllib import quote_plus
- from json import loads
- from lxml import html
- from cgi import escape
- from dateutil import parser
# engine dependent config
categories = ['news', 'social media']
paging = True  # request() maps pageno to a 10-result offset

# search-url
# Digg returns a JSON envelope whose 'html' field holds rendered result markup
base_url = 'https://digg.com/'
search_url = base_url + 'api/search/{query}.json?position={position}&format=html'

# specific xpath variables (applied to the HTML fragment inside the JSON reply)
results_xpath = '//article'  # one <article> per result
link_xpath = './/small[@class="time"]//a'
title_xpath = './/h2//a//text()'
content_xpath = './/p//text()'
pubdate_xpath = './/time'  # publication timestamp lives in @datetime
# do search-request
def request(query, params):
    """Fill in the Digg search URL for *query* and the requested page.

    params['pageno'] is 1-based; Digg's API takes a 0-based result
    offset in steps of 10, passed as the 'position' query argument.
    """
    position = (params['pageno'] - 1) * 10
    params['url'] = search_url.format(query=quote_plus(query),
                                      position=position)
    return params
# get response from search-request
def response(resp):
    """Parse Digg's JSON reply into a list of searx result dicts.

    The API wraps rendered HTML in a JSON envelope; each result is an
    <article> element carrying the target URL in @data-contenturl.
    Returns dicts with url, title, content, publishedDate and thumbnail.
    """
    results = []

    search_result = loads(resp.text)

    # nothing to parse when the envelope has no rendered markup
    if not search_result.get('html'):
        return results

    dom = html.fromstring(search_result['html'])

    # parse results
    for result in dom.xpath(results_xpath):
        url = result.attrib.get('data-contenturl')

        # guard against malformed articles: indexing [0] on an empty
        # xpath() result would raise IndexError and abort the whole page
        images = result.xpath('.//img')
        timestamps = result.xpath(pubdate_xpath)
        if not images or not timestamps:
            continue

        # default to '' so the replace() below never sees None
        thumbnail = images[0].attrib.get('src', '')
        title = ''.join(result.xpath(title_xpath))
        content = escape(''.join(result.xpath(content_xpath)))
        publishedDate = parser.parse(timestamps[0].attrib.get('datetime'))

        # http to https
        thumbnail = thumbnail.replace("http://static.digg.com", "https://static.digg.com")

        # append result
        results.append({'url': url,
                        'title': title,
                        'content': content,
                        'template': 'videos.html',
                        'publishedDate': publishedDate,
                        'thumbnail': thumbnail})

    # return results
    return results
|