parse_vedabase.py

# Parser Vedabase
from bs4 import BeautifulSoup as bs
import bs4
import requests
import time

from memory import memory
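
# NOTE (assumption): the `memory` module is not included in this file. Judging
# by the `memory.set(purport_id, purport_text, ex=600)` call in get_full_verse,
# it most likely exposes a Redis-like client with an expiring `set`. A minimal
# sketch of such a module, assuming redis-py, could be:
#
#     # memory.py
#     import redis
#     memory = redis.Redis(decode_responses=True)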


def __get_soup(url) -> bs4.BeautifulSoup:
    '''Returns soup from a site page.'''
    while True:  # retry the request after a delay if the server closes the connection
        try:
            r = requests.get(url)
        except requests.exceptions.RequestException:
            time.sleep(30)
        else:
            break
    r.encoding = 'utf-8'
    return bs(r.text, 'html.parser')


def __formate_verse_number(query_arr: list, attr_num: int, digits: int):
    '''Validates the chapter/verse attribute (e.g. "1.5.25" or "1.16-18")
    and returns it as a URL path segment, or False if it is malformed.'''
    try:
        attrs = query_arr[attr_num].split('.')
    except IndexError:
        return False
    if len(attrs) != digits:
        return False
    # every part except the last must be a plain number (canto/chapter)
    for i in range(len(attrs) - 1):
        if not attrs[i].isdigit():
            return False
    # the last part may be a single verse ("25") or a verse block ("16-18")
    verses = attrs[-1].split('-')
    if len(verses) > 2 or (len(verses) == 1 and not verses[0].isdigit()):
        return False
    if len(verses) == 2 and (not verses[0].isdigit() or not verses[1].isdigit()):
        return False
    return '/'.join(attrs)


def __construct_link(query: str) -> dict:
    '''Builds a vedabase.io verse URL from a "<lang> <book> [volume] <verse>" query.
    Returns {'link': ..., 'attrs': ...} on success or {'errors': [...]} on failure.'''
    url_arr = ['https://vedabase.io']
    languages = ['en', 'nl', 'ru', 'da', 'et', 'sk', 'es', 'de', 'uk',
                 'lt', 'sl', 'fi', 'cs', 'hu', 'fr', 'ko', 'pt-br', 'bg', 'ja', 'zu']
    cc_books = ['adi', 'madhya', 'antya']
    books = ['bg', 'sb', 'cc']
    errors = []
    attrs = query.split()
    # check query attributes
    if len(attrs) < 2:  # guard against queries shorter than "<lang> <book>"
        return {'errors': ['Query must contain at least a language code and a book code.']}
    lang_code = attrs[0].lower()
    if lang_code not in languages:
        errors.append("Wrong language code.")
    else:
        url_arr.append(f'{lang_code}/library')
    if attrs[1].lower() not in books:
        errors.append("Wrong book code. Try one of 'bg', 'sb', 'cc'")
    else:
        try:
            cc_volume = attrs[2].lower()
        except IndexError:
            cc_volume = 'error'
        if attrs[1].lower() == 'cc' and cc_volume not in cc_books:
            errors.append('Wrong volume code of CC')
        elif attrs[1].lower() == 'cc' and attrs[2].lower() in cc_books:
            verse_link = __formate_verse_number(attrs, 3, 2)
            if verse_link:
                url_arr.append(f'cc/{attrs[2].lower()}/{verse_link}')
            else:
                errors.append('Wrong chapter or verse of CC (Example: 5.25)')
        elif attrs[1].lower() == 'sb':
            verse_link = __formate_verse_number(attrs, 2, 3)
            if verse_link:
                url_arr.append(f'sb/{verse_link}')
            else:
                errors.append('Wrong chapter or verse of SB (Example: 1.5.25)')
        else:
            verse_link = __formate_verse_number(attrs, 2, 2)
            if verse_link:
                url_arr.append(f'bg/{verse_link}')
            else:
                errors.append('Wrong chapter or verse of BG (Example: 5.25)')
    if errors:
        return {'errors': errors}
    else:
        return {'link': '/'.join(url_arr), 'attrs': attrs}


def __formate_purport_paragraph(div):
    '''Extracts text from one purport <div>, keeping line breaks for embedded verse quotes.'''
    # div.get('class', []) avoids a KeyError on divs that carry no class attribute
    if 'r-verse-text' in div.get('class', []):
        return div.get_text('\n', strip=True)
    else:
        return div.get_text()


# getting data
def get_full_verse(query: str) -> dict:
    '''Fetches a verse page and returns its link, title, verse text, word-by-word
    synonyms, translation and (if present) the id of the cached purport.'''
    parsed_query = __construct_link(query)
    if 'errors' in parsed_query:
        return {'errors': parsed_query['errors']}
    url = parsed_query['link']
    soup = __get_soup(url)
    title = soup.title.text
    if title == '':
        return {'errors': ['Verse not found. Maybe this verse is in a block of verses. Example: bg 1.16-18\nhttps://vedabase.io/en/library/bg/1/16-18/']}
    try:
        verse_text = soup.find(
            'div', class_='wrapper-verse-text').find_all(class_='r-verse-text')
        verse_text = '\n\n'.join([v.get_text('\n', strip=True)
                                  for v in verse_text])
        word_by_word = soup.find(class_='r-synonyms').get_text()
        translation = soup.find(class_='r-translation').get_text()
        purport_block = soup.find('div', class_='wrapper-puport')
    except Exception:
        return {'errors': ['Page not found']}
    if purport_block:
        purport_text = title + ' Purport\n\n' + '\n\n'.join(
            [__formate_purport_paragraph(div) for div in purport_block.find_all('div')])
        # cache the purport for 10 minutes under a key like "en_bg_2_13"
        purport_id = '_'.join(parsed_query['attrs']).replace(
            '.', '_').replace('-', '_')
        memory.set(purport_id, purport_text, ex=600)
    else:
        purport_id = ''
    return {
        'link': url,
        'title': title,
        'verse': verse_text,
        'word-by-word': word_by_word,
        'translation': translation,
        'purport_id': purport_id
    }
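

# Illustrative usage sketch (assumption: not part of the original module).
# Running it requires network access to vedabase.io and a working `memory`
# backend for purport caching. The query format "<lang> <book> [volume] <chapter.verse>"
# follows __construct_link above.
if __name__ == '__main__':
    result = get_full_verse('en bg 2.13')
    if 'errors' in result:
        print('\n'.join(result['errors']))
    else:
        print(result['title'])
        print(result['verse'])
        print(result['translation'])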