#!/usr/bin/env python3
"""CGI script: scrape ESPN Cricinfo story listings and emit the newest
unique articles as a JSON array on stdout."""
import datetime
import json
import re

import bs4
import requests

# CGI response header; the empty print terminates the header section so
# everything after it is the JSON response body.
print('Content-Type: application/json')
print()
  9. output = []
  10. urls = set()
  11. sources = [ 'http://www.espncricinfo.com/ci/content/story/news.html', 'http://www.espncricinfo.com/ci/content/story/genre.html?genre=2', 'http://www.espncricinfo.com/ci/content/story/genre.html?genre=3' ]
  12. articleLists = [[] for _ in sources]
  13. for i, source in enumerate(sources):
  14. while True:
  15. request = requests.get(source)
  16. if not 400 <= request.status_code < 600:
  17. break
  18. soup = bs4.BeautifulSoup(request.text, 'html.parser')
  19. for article in soup.find_all("article", class_="story-item"):
  20. articleLists[i].append({
  21. 'title': article.find('h2', class_='story-title').get_text(),
  22. 'image': re.sub("[.]2[.]([^.]+)$", ".\\1", re.sub("[.]5[.]([^.]+)$", ".4.\\1", re.sub("[.]4[.]([^.]+)$", ".3.\\1", article.find('figure', class_='story-img').img['src'].replace(".icon.", ".")))),
  23. 'url': 'http://www.espncricinfo.com' + article.find('h2', class_='story-title').a['href'],
  24. 'description': article.find('strong', class_='story-date').next_sibling.strip(),
  25. 'date': (datetime.datetime.strptime(article.find('strong', class_='story-date').get_text().strip(), "%b %d, %Y") - datetime.datetime.fromtimestamp(0)).total_seconds() * 1000,
  26. 'author': article.find('div', class_='author').get_text().strip()
  27. })
  28. imageParentTag = article.find('figure', class_='story-img')
  29. if imageParentTag:
  30. articleLists[i][-1]['image'] = re.sub("[.]2[.]([^.]+)$", ".\\1", re.sub("[.]5[.]([^.]+)$", ".4.\\1", re.sub("[.]4[.]([^.]+)$", ".3.\\1", imageParentTag.img['src'].replace(".icon.", "."))))
  31. output = []
  32. while len(articleLists) > 0 and len(output) < 15:
  33. article = sorted([next(b for b in a if b['url'] not in urls) for a in articleLists], key=lambda a: int(re.search('/([0-9]+)[.]html$', a["url"]).groups()[0]), reverse=True)[0]
  34. urls |= { article['url'] }
  35. output.append(article)
  36. for _list in articleLists:
  37. if article in _list:
  38. _list.remove(article)
  39. if not any(a['url'] not in urls for a in _list):
  40. articleLists.remove(_list)
  41. break
  42. print(json.dumps(output))