#!/bin/python
"""
js interface: http://www.ztm.poznan.pl/themes/ztm/dist/js/app.js
nodes: http://www.ztm.poznan.pl/goeuropa-api/all-nodes
stops in node: http://www.ztm.poznan.pl/goeuropa-api/node_stops/{node:symbol}
stops: http://www.ztm.poznan.pl/goeuropa-api/stops-nodes
bike stations: http://www.ztm.poznan.pl/goeuropa-api/bike-stations
alerts: http://www.ztm.poznan.pl/goeuropa-api/alerts/{lineId}
"""
import contextlib
import json
import os
import re
import sqlite3
import sys

import requests
from bs4 import BeautifulSoup


  17. class TimetableDownloader:
  18. """
  19. downloader class
  20. """
  21. def __init__(self, verbose):
  22. self.session = requests.session()
  23. self.verbose = verbose
  24. def __get_validity(self):
  25. """
  26. get timetable validity
  27. """
  28. index = self.__get('https://www.ztm.poznan.pl/goeuropa-api/index')
  29. option = re.search('<option value="[0-9]{8}" selected', index.text).group()
  30. return option.split('"')[1]
  31. def __get_nodes(self):
  32. """
  33. get nodes
  34. """
  35. index = self.__get('https://www.ztm.poznan.pl/goeuropa-api/all-nodes')
  36. return [(stop['symbol'], stop['name']) for stop in json.loads(index.text)]
  37. def __get_stops(self, node):
  38. """
  39. get stops
  40. """
  41. index = self.__get('https://www.ztm.poznan.pl/goeuropa-api/node_stops/{}'.format(node))
  42. stops = []
  43. for stop in json.loads(index.text):
  44. name = stop['stop']['name']
  45. stop_id = stop['stop']['id']
  46. number = re.findall("\\d+", stop['stop']['symbol'])[0]
  47. lat = stop['stop']['lat']
  48. lon = stop['stop']['lon']
  49. # todo test
  50. directions = ', '.join(['{} → {}'.format(transfer['name'], transfer['headsign'])
  51. for transfer in stop['transfers']
  52. if transfer['headsign'] != name])
  53. if directions == '':
  54. continue
  55. stops.append((stop_id, node, number, lat, lon, directions))
  56. return stops
  57. def __get_lines(self):
  58. """
  59. get lines
  60. """
  61. index = self.__get('https://www.ztm.poznan.pl/goeuropa-api/index')
  62. soup = BeautifulSoup(index.text, 'html.parser')
  63. lines = {line['data-lineid']: line.text for line in
  64. soup.findAll(attrs={'class': re.compile(r'.*\blineNo-bt\b.*')})}
  65. return lines
  66. def __get_route(self, line_id):
  67. """
  68. get routes
  69. """
  70. index = self.__get('https://www.ztm.poznan.pl/goeuropa-api/line-info/{}'.format(line_id))
  71. soup = BeautifulSoup(index.text, 'html.parser')
  72. directions = soup.findAll(attrs={'class': re.compile(r'.*\baccordion-item\b.*')})
  73. routes = {}
  74. for direction in directions:
  75. direction_id = direction['data-directionid']
  76. route = [{'id': stop.find('a')['data-stopid'], 'name': stop['data-name'],
  77. 'onDemand': re.search('stop-onDemand', str(stop['class'])) != None,
  78. 'variant_type': re.search('variant-type-(in|out)', str(stop['class'])).groups()[0]\
  79. if re.search('variant-type-(in|out)', str(stop['class'])) is not None else None,
  80. 'variant_first': re.search('first-variant', str(stop['class'])) is not None,
  81. 'variant_last': re.search('last-variant', str(stop['class'])) is not None}
  82. for stop in direction.findAll(attrs={'class': re.compile(r'.*\bstop-itm\b.*')})]
  83. routes[direction_id] = route
  84. return routes
  85. def __get_stop_times(self, stop_id, line_id, direction_id):
  86. index = self.__post('https://www.ztm.poznan.pl/goeuropa-api/stop-info/{}/{}'.
  87. format(stop_id, line_id), {'directionId': direction_id})
  88. soup = BeautifulSoup(index.text, 'html.parser')
  89. legends = {}
  90. for row in soup.find(attrs={'class': re.compile(r'.*\blegend-box\b.*')}).findAll('li'):
  91. row = row.text.split('-')
  92. row[0] = row[0].rstrip()
  93. row[1] = row[1].lstrip()
  94. if row[0] != '_':
  95. legends[row[0]] = '-'.join(row[1:])
  96. schedules = {}
  97. for mode in soup.findAll(attrs={'class': re.compile(r'.*\bmode-tab\b.*')}):
  98. mode_name = mode['data-mode']
  99. schedule = {row.find('th').text: [
  100. {'time': minute.text, 'lowFloor': re.search('n-line', str(minute['class'])) != None}
  101. for minute in row.findAll('a')]
  102. for row in mode.find(attrs={'class': re.compile(r'.*\bscheduler-hours\b.*')}).
  103. findAll('tr')}
  104. schedule_2 = {hour: times for hour, times in schedule.items() if times != []}
  105. schedule = []
  106. for hour, deps in schedule_2.items():
  107. for dep in deps:
  108. schedule.append((hour, *self.__describe(dep['time'], legends), dep['lowFloor']))
  109. schedules[mode_name] = schedule
  110. return schedules
  111. @staticmethod
  112. def __describe(dep_time, legend):
  113. """
  114. describe departure
  115. """
  116. desc = []
  117. while re.match('^\\d+$', dep_time) is None:
  118. try:
  119. if dep_time[-1] != ',':
  120. desc.append(legend[dep_time[-1]])
  121. except KeyError:
  122. pass
  123. dep_time = dep_time[:-1]
  124. return (int(dep_time), '; '.join(desc))
  125. def __get(self, url):
  126. try:
  127. return self.session.get(url, verify='bundle.pem')
  128. except:
  129. self.session = requests.session()
  130. return self.session.get(url, verify='bundle.pem')
  131. def __post(self, url, data):
  132. try:
  133. return self.session.post(url, data=data, verify='bundle.pem')
  134. except:
  135. self.session = requests.session()
  136. return self.session.post(url, data=data, verify='bundle.pem')
  137. # todo take into account parent (and for variant stops it needs synced departure times)
  138. @staticmethod
  139. def __calculate_time_to_next_stop(times, last_time_of_arrival):
  140. times.sort()
  141. earliest_departure = times[0]
  142. if last_time_of_arrival == "":
  143. return None, earliest_departure
  144. hour = int(earliest_departure[:2])
  145. minute = int(earliest_departure[3:])
  146. minute = minute + (60 * hour)
  147. last_hour = int(last_time_of_arrival[:2])
  148. last_minute = int(last_time_of_arrival[3:])
  149. last_minute = last_minute + (60 * last_hour)
  150. time_to_next_stop = minute - last_minute
  151. return time_to_next_stop, earliest_departure
  152. def download(self):
  153. """
  154. main function
  155. """
  156. if os.path.exists('timetable.db'):
  157. connection = sqlite3.connect('timetable.db')
  158. cursor = connection.cursor()
  159. cursor.execute("select value from metadata where key = 'validFrom'")
  160. current_valid_from = cursor.fetchone()[0]
  161. cursor.close()
  162. connection.close()
  163. if self.__get_validity() <= current_valid_from:
  164. return 304
  165. else:
  166. os.remove('timetable.db')
  167. with sqlite3.connect('timetable.db') as connection:
  168. try:
  169. cursor = connection.cursor()
  170. cursor.execute('create table metadata(key TEXT PRIMARY KEY, value TEXT)')
  171. cursor.execute('create table nodes(symbol TEXT PRIMARY KEY, name TEXT)')
  172. cursor.execute('create table stops(id TEXT PRIMARY KEY, symbol TEXT \
  173. references node(symbol), number TEXT, lat REAL, lon REAL, \
  174. headsigns TEXT)')
  175. cursor.execute('create table lines(id TEXT PRIMARY KEY, number TEXT)')
  176. cursor.execute('create table timetables(id TEXT PRIMARY KEY, stop_id TEXT \
  177. references stop(id), line_id TEXT references line(id), \
  178. headsign TEXT, parent TEXT references id, \
  179. parent_variant TEXT references id)')
  180. cursor.execute('create table departures(id INTEGER PRIMARY KEY, \
  181. timetable_id TEXT references timetable(id), \
  182. hour INTEGER, minute INTEGER, mode TEXT, \
  183. lowFloor INTEGER, modification TEXT)')
  184. validity = self.__get_validity()
  185. print(validity)
  186. sys.stdout.flush()
  187. cursor.execute("insert into metadata values('validFrom', ?)", (validity,))
  188. nodes = self.__get_nodes()
  189. cursor.executemany('insert into nodes values(?, ?)', nodes)
  190. node_i = 1
  191. for symbol, _ in nodes:
  192. if self.verbose:
  193. print('node {}'.format(node_i))
  194. stops = self.__get_stops(symbol)
  195. cursor.executemany('insert into stops values(?, ?, ?, ?, ?, ?)', stops)
  196. node_i += 1
  197. lines = self.__get_lines()
  198. cursor.executemany('insert into lines values(?, ?)', lines.items())
  199. timetable_id = 1
  200. line_i = 1
  201. for line_id, _ in lines.items():
  202. route = self.__get_route(line_id)
  203. route_i = 1
  204. for direction, stops in route.items():
  205. stop_i = 1
  206. parent_stop = None
  207. parent_stop_variant = None
  208. for stop in stops[:-1]:
  209. if self.verbose:
  210. print("stop {} in route {} in line {}".format(stop_i, route_i, line_i))
  211. timetables = self.__get_stop_times(stop['id'], line_id, direction)
  212. if stop_i == 1 and stop['variant_type'] is None:
  213. if self.verbose:
  214. print('stop1 & main')
  215. parent = None
  216. parent_variant = None
  217. parent_stop = stop['id']
  218. elif stop['variant_type'] == 'in' and stop['variant_first']:
  219. if self.verbose:
  220. print('in & first')
  221. parent = None
  222. parent_variant = None
  223. parent_stop_variant = stop['id']
  224. elif stop_i > 1 and stop['variant_type'] is None:
  225. if self.verbose:
  226. print('stop>1 & main')
  227. parent = parent_stop
  228. parent_variant = parent_stop_variant
  229. parent_stop = stop['id']
  230. parent_stop_variant = None
  231. elif stop['variant_type'] is not None and not stop['variant_first']:
  232. if self.verbose:
  233. print('variant & not first')
  234. parent = None
  235. parent_variant = parent_stop_variant
  236. parent_stop_variant = stop['id']
  237. elif stop['variant_type'] == 'out' and stop['variant_first']:
  238. if self.verbose:
  239. print('out & first')
  240. parent = None
  241. parent_variant = parent_stop
  242. parent_stop_variant = stop['id']
  243. if stop['variant_type'] == 'out' and stop['variant_last']:
  244. parent_stop_variant = None
  245. cursor.execute('insert into timetables values(?, ?, ?, ?, ?, ?)',
  246. (timetable_id, stop['id'], line_id, stops[-1]['name'], parent, parent_variant))
  247. for mode, times in timetables.items():
  248. cursor.executemany('insert into departures values(null, ?, ?, ?, ?, ?, \
  249. ?)', [(timetable_id, hour, minute, mode, lowfloor, desc)
  250. for hour, minute, desc, lowfloor in times])
  251. stop_i += 1
  252. timetable_id += 1
  253. route_i += 1
  254. line_i += 1
  255. except KeyboardInterrupt:
  256. return 404
  257. return 0
  258. if __name__ == '__main__':
  259. verbose = False
  260. try:
  261. if sys.argv[1] == '-v':
  262. verbose = True
  263. except IndexError:
  264. pass
  265. downloader = TimetableDownloader(verbose)
  266. exit(downloader.download())