12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152 |
- #!/usr/bin/env python3
- import datetime
- import json
- import re
- import bs4
- import requests
# Emit the CGI response header before any body output.
print('Content-Type: application/json')
print()

# Accumulators: final article feed and the set of URLs already emitted.
output = []
urls = set()

# ESPNcricinfo listing pages to scrape: the general news page plus two genre feeds.
sources = [
    'http://www.espncricinfo.com/ci/content/story/news.html',
    'http://www.espncricinfo.com/ci/content/story/genre.html?genre=2',
    'http://www.espncricinfo.com/ci/content/story/genre.html?genre=3',
]

# One parsed-article list per source, filled by the scraping loop below.
articleLists = [[] for _ in sources]
def _full_image_url(src):
    """Rewrite a thumbnail image URL into its larger variant.

    Drops the ".icon." marker, then steps the size suffix down:
    ".4.ext" -> ".3.ext", ".5.ext" -> ".4.ext", and a ".2.ext"
    suffix is stripped to ".ext".
    """
    src = src.replace(".icon.", ".")
    src = re.sub(r"[.]4[.]([^.]+)$", ".3.\\1", src)
    src = re.sub(r"[.]5[.]([^.]+)$", ".4.\\1", src)
    src = re.sub(r"[.]2[.]([^.]+)$", ".\\1", src)
    return src


for i, source in enumerate(sources):
    # Retry transient HTTP errors (4xx/5xx), but give up after a few
    # attempts instead of looping forever on a persistently failing
    # source (the original `while True` could hang the whole script).
    for _attempt in range(5):
        request = requests.get(source)
        if not 400 <= request.status_code < 600:
            break
    else:
        # Source never answered successfully; leave its list empty.
        continue

    soup = bs4.BeautifulSoup(request.text, 'html.parser')

    for article in soup.find_all("article", class_="story-item"):
        titleTag = article.find('h2', class_='story-title')
        dateTag = article.find('strong', class_='story-date')
        entry = {
            'title': titleTag.get_text(),
            'url': 'http://www.espncricinfo.com' + titleTag.a['href'],
            # The teaser text follows the date element in the markup.
            'description': dateTag.next_sibling.strip(),
            # Publication date ("Mon D, YYYY") as epoch milliseconds.
            'date': (datetime.datetime.strptime(dateTag.get_text().strip(), "%b %d, %Y")
                     - datetime.datetime.fromtimestamp(0)).total_seconds() * 1000,
            'author': article.find('div', class_='author').get_text().strip(),
        }
        # Not every story has an image. The original computed 'image'
        # unconditionally (crashing with AttributeError when the figure
        # was missing) and then recomputed it under this guard; only the
        # guarded computation is kept.
        imageParentTag = article.find('figure', class_='story-img')
        if imageParentTag:
            entry['image'] = _full_image_url(imageParentTag.img['src'])
        articleLists[i].append(entry)
-
# Merge the per-source lists into one feed of at most 15 articles.
# Each round takes, from among the heads of the remaining lists, the
# article with the highest numeric story id in its URL (i.e. the newest).
output = []
while articleLists and len(output) < 15:
    # Drop lists with no unseen articles first, so the next() calls
    # below cannot raise StopIteration (the original crashed on a
    # source list that was empty or fully consumed via shared URLs).
    articleLists = [lst for lst in articleLists
                    if any(a['url'] not in urls for a in lst)]
    if not articleLists:
        break
    # Head of each list = its first article not yet emitted.
    heads = [next(a for a in lst if a['url'] not in urls)
             for lst in articleLists]
    article = max(heads,
                  key=lambda a: int(re.search('/([0-9]+)[.]html$', a['url']).group(1)))
    urls.add(article['url'])
    output.append(article)
    # Remove the chosen article from the first list holding it; any
    # duplicate in another list is skipped later via the urls set.
    for lst in articleLists:
        if article in lst:
            lst.remove(article)
            break

print(json.dumps(output))
|