scribd.py 1.3 KB

12345678910111213141516171819202122232425262728293031323334
#!/usr/bin/env python
# FORK OF: https://github.com/vaginessa/Scribd-Downloader/
# Ported from Python 2 to Python 3 and adapted to work for my use case.
  4. from bs4 import BeautifulSoup
  5. import requests
  6. import sys
  7. import re
  8. import os
  9. response = requests.request(method='GET', url=sys.argv[1])
  10. with open("index.html", "w", encoding="utf-8") as f:
  11. f.write(response.text)
  12. soup = BeautifulSoup(response.text, 'html.parser')
  13. extraction = ''
  14. train = 1
  15. title = soup.find("div", {"class": "auto__app_page_body_metadata_original_title data_row original_title inline"}).get_text().split(":")[1].replace("&", "").replace("_","")
  16. js_text = soup.find('script', type='text/javascript')
  17. print(js_text)
  18. for opening in js_text:
  19. for inner_opening in opening:
  20. urls = re.findall("contentUrl\: \"(.*?)\"", inner_opening)
  21. if not urls == '':
  22. for url in urls:
  23. replacement = url.replace('/pages/', '/images/').replace('jsonp', 'jpg')
  24. print(replacement)
  25. #print replacement
  26. print('Downloading page ' + str(train))
  27. #response = requests.get(replacement, stream=True)
  28. #with open(str(train) + '.jpg', 'wb') as out_file:
  29. # out_file.write(response.content)
  30. train+=1