#!/usr/bin/env python
# pokemon-news-rss.py
import datetime
import email.utils
import html
import re
import urllib.parse

import requests
  6. HEADERS = {'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:17.0) Gecko/20121201 icecat/17.0.1', "Content-Type": "text/html;charset=UTF-8"}
  7. def scrape():
  8. data = requests.get("https://www.pokemon.com/us/pokemon-news", headers=HEADERS).text
  9. #with open("index.html", "w") as f:
  10. #f.write(data)
  11. dates = re.findall('<p class="date">(.*?)</p>', data)
  12. titles = re.findall("<h3>(.*?)</h3>", data)[2:]
  13. descriptions = re.findall('<p>|<p class="hidden-mobile">(.*?)</p>', data)
  14. descriptions = [x for x in descriptions if x]
  15. linkers = re.findall(f'<a href="(.*?)">', data)
  16. links = []
  17. images = re.findall(f'<img src="/static-assets(.*?)"',data)[2:]
  18. for link in linkers:
  19. if 'rel="" ' in link:
  20. pass
  21. elif "?article" in link:
  22. pass
  23. elif link.startswith(("/us/pokemon-news/","/us/strategy/")):
  24. links.append(link)
  25. links = links[2:]
  26. for (desc, title, date, link, image) in zip(descriptions, titles, dates, links, images):
  27. print(f"""<item>
  28. <title>{title}</title>
  29. <link>{"https://pokemon.com/"+link}</link>
  30. <pubdate>{datetime.datetime.strptime(date, '%B %d, %Y').strftime('%a, %d %b %Y')}</pubdate>
  31. <description><![CDATA[<img src="https://pokemon.com/static-assets/{image}" alt="{title}">
  32. {desc}]]></description>
  33. </item>""")
  34. def api():
  35. data = requests.get("https://www.pokemon.com/api/1/us/news/get-news.json").json()
  36. for article in data:
  37. print(f"""<item>
  38. <title>{article['title']}</title>
  39. <link>{"https://pokemon.com"+article['url']}</link>
  40. <pubdate>{datetime.datetime.strptime(article['date'], '%B %d, %Y').strftime('%a, %d %b %Y')}</pubdate>
  41. <description><![CDATA[<img src="https://pokemon.com{article['image']}" alt="{article['alt']}">
  42. {article['shortDescription']}]]></description>
  43. </item>""")
  44. if __name__ == "__main__":
  45. print('''<rss version="2.0">
  46. <channel>
  47. <title>Pokemon News</title>
  48. <link>https://www.pokemon.com/us/pokemon-news/</link>
  49. <description>RSS for Pokemon News articles</description>
  50. ''')
  51. api()
  52. print("""
  53. </channel>
  54. </rss>""")