# convert_to_csv.py
  1. #!/usr/bin/env python
  2. from bs4 import BeautifulSoup as bs
  3. import csv
  4. # from fuzzywuzzy import fuzz
  5. import glob
  6. from pprint import pprint
  7. import re
  8. import calendar
  9. import phonenumbers
  10. DIR_DATA = 'data/'
  11. PATTERN_BIRTHDAY = re.compile(r'(?P<month>\w+?) (?P<day>\d+)([, ]+(?P<year>\d+)){0,}')
  12. FIELDNAMES = ['Name','Given Name','Additional Name','Family Name','Yomi Name','Given Name Yomi','Additional Name Yomi','Family Name Yomi','Name Prefix','Name Suffix','Initials','Nickname','Short Name','Maiden Name','Birthday','Gender','Location','Billing Information','Directory Server','Mileage','Occupation','Hobby','Sensitivity','Priority','Subject','Notes','Language','Photo','Group Membership','E-mail 1 - Type','E-mail 1 - Value','IM 1 - Type','IM 1 - Service','IM 1 - Value','Website 1 - Type','Website 1 - Value','Phone 1 - Type','Phone 1 - Value']
  13. list_files = glob.glob(DIR_DATA + 'about_*.html')
  14. list_of_things = set()
  15. with open('from_facebook.csv', 'w', newline='') as csvfile:
  16. writer = csv.DictWriter(csvfile, fieldnames=FIELDNAMES)
  17. writer.writeheader()
  18. for file in list_files:
  19. # print(file)
  20. with open(file) as f:
  21. file_data = f.read()
  22. soup = bs(file_data, 'lxml')
  23. # Look for files with incorrect content
  24. match = (soup.title.text == "You Can't Use This Feature Right Now") or \
  25. (soup.title.text == "Content Not Found") or \
  26. (soup.title.text == "Error Facebook")
  27. if match:
  28. print(file, 'is wrong. Skipping.')
  29. print()
  30. continue
  31. # Name
  32. match = soup.select_one('div span div span strong')
  33. # name_full = match.get_text()
  34. name_full = ''.join(text for text in match.find_all(text=True) if text.parent.name != 'span')
  35. # Alternate Name
  36. match_1 = match.select_one('.alternate_name')
  37. name_alternate = None
  38. if match_1:
  39. name_alternate = match_1.get_text()[1:-1]
  40. # Birthday
  41. birthday = None
  42. birthday_google = None
  43. match = soup.find(text='Birthday')
  44. if match:
  45. match_1 = match.parent.parent.parent.next_sibling.get_text()
  46. if match_1:
  47. match_2 = PATTERN_BIRTHDAY.search(match_1)
  48. if match_2:
  49. year = None
  50. month = match_2['month']
  51. day = match_2['day']
  52. if match_2.group(3):
  53. year = match_2['year']
  54. num_month = format(list(calendar.month_name).index(month), '02d')
  55. num_day = day.zfill(2)
  56. if year:
  57. birthday = year + num_month + num_day
  58. birthday_google = num_month + '/' + num_day + '/' + year
  59. else:
  60. birthday = '--' + num_month + num_day
  61. birthday_google = num_month + '/' + num_day
  62. # Gender
  63. gender = None
  64. match = soup.find(text='Gender')
  65. if match:
  66. match_1 = match.parent.parent.parent.next_sibling.get_text(strip=True)
  67. if match_1:
  68. gender = match_1
  69. # Email
  70. list_email = list()
  71. matches = soup.find_all(text='Email')
  72. if matches:
  73. for match in matches:
  74. match_1 = match.parent.parent.parent.next_sibling.get_text(strip=True)
  75. if match_1:
  76. list_email.append(match_1)
  77. # Adress
  78. relationship = None
  79. match = soup.find(text='Relationship')
  80. if match:
  81. match_1 = match.parent.parent.parent.parent.parent.parent.next_sibling.get_text(strip=True)
  82. if match_1:
  83. relationship = match_1
  84. # Relationship
  85. address = None
  86. match = soup.find(text='Address')
  87. if match:
  88. match_1 = match.parent.parent.parent.next_sibling.get_text()
  89. if match_1:
  90. address = match_1
  91. # facebook link
  92. link_facebook = None
  93. match = soup.find(text='Facebook')
  94. if match:
  95. match_1 = match.parent.parent.parent.next_sibling.get_text(strip=True)
  96. if match_1:
  97. link_facebook = match_1
  98. # instagram link
  99. link_instagram = None
  100. match = soup.find(text='Instagram')
  101. if match:
  102. match_1 = match.parent.parent.parent.next_sibling.get_text(strip=True)
  103. if match_1:
  104. link_instagram = match_1
  105. # YouTube link
  106. link_youtube = None
  107. match = soup.find(text='YouTube')
  108. if match:
  109. match_1 = match.parent.parent.parent.next_sibling.get_text(strip=True)
  110. if match_1:
  111. link_youtube = match_1
  112. # Twitter link
  113. link_twitter = None
  114. match = soup.find(text='Twitter')
  115. if match:
  116. match_1 = match.parent.parent.parent.next_sibling.get_text(strip=True)
  117. if match_1:
  118. link_twitter = match_1
  119. # Generic website link
  120. list_link_websites = list()
  121. matches = soup.find_all(text='Websites')
  122. if matches:
  123. for match in matches:
  124. match_1 = match.parent.parent.parent.next_sibling.get_text(strip=True)
  125. if match_1:
  126. list_link_websites.append(match_1)
  127. # Other website link
  128. link_other = None
  129. match = soup.find(text='Other Service')
  130. if match:
  131. match_1 = match.parent.parent.parent.next_sibling.get_text(strip=True)
  132. if match_1:
  133. link_other = match_1
  134. # Tumblr link
  135. link_tumblr = None
  136. match = soup.find(text='Tumblr')
  137. if match:
  138. match_1 = match.parent.parent.parent.next_sibling.get_text(strip=True)
  139. if match_1:
  140. link_tumblr = match_1
  141. # Snapchat
  142. social_snapchat = None
  143. match = soup.find(text='Snapchat')
  144. if match:
  145. match_1 = match.parent.parent.parent.next_sibling.get_text(strip=True)
  146. if match_1:
  147. social_snapchat = match_1
  148. # eBuddy
  149. social_ebuddy = None
  150. match = soup.find(text='eBuddy')
  151. if match:
  152. match_1 = match.parent.parent.parent.next_sibling.get_text(strip=True)
  153. if match_1:
  154. social_ebuddy = match_1
  155. # LINE
  156. social_line = None
  157. match = soup.find(text='LINE')
  158. if match:
  159. match_1 = match.parent.parent.parent.next_sibling.get_text(strip=True)
  160. if match_1:
  161. social_line = match_1
  162. # Skype
  163. social_skype = None
  164. match = soup.find(text='Skype')
  165. if match:
  166. match_1 = match.parent.parent.parent.next_sibling.get_text(strip=True)
  167. if match_1:
  168. social_skype = match_1
  169. # Current City
  170. city_current = None
  171. match = soup.find(text='Current City')
  172. if match:
  173. match_1 = match.parent.parent.parent.next_sibling.get_text(strip=True)
  174. if match_1:
  175. city_current = match_1
  176. # Hometown
  177. city_home = None
  178. match = soup.find(text='Hometown')
  179. if match:
  180. match_1 = match.parent.parent.parent.next_sibling.get_text(strip=True)
  181. if match_1:
  182. city_home = match_1
  183. # Mobile
  184. number_mobile = None
  185. match = soup.find(text='Mobile')
  186. if match:
  187. match_1 = match.parent.parent.parent.next_sibling.get_text(strip=True)
  188. if match_1:
  189. number_parsed = phonenumbers.parse(match_1, 'IN')
  190. number_rfc3966 = phonenumbers.format_number(number_parsed, phonenumbers.PhoneNumberFormat.RFC3966)
  191. number_mobile = number_rfc3966
  192. # # Print All
  193. # print(name_full)
  194. # if name_alternate:
  195. # print(name_alternate)
  196. # if birthday:
  197. # print(birthday)
  198. # if gender:
  199. # print(gender)
  200. # if list_email:
  201. # for email in list_email:
  202. # print(email)
  203. # if relationship:
  204. # print(relationship)
  205. # if address:
  206. # print(address)
  207. # if link_facebook:
  208. # print(link_facebook)
  209. # if link_instagram:
  210. # print(link_instagram)
  211. # if link_youtube:
  212. # print(link_youtube)
  213. # if link_twitter:
  214. # print(link_twitter)
  215. # if list_link_websites:
  216. # for link in list_link_websites:
  217. # print(link)
  218. # if link_other:
  219. # print(link_other)
  220. # if link_tumblr:
  221. # print(link_tumblr)
  222. # if social_snapchat:
  223. # print(social_snapchat)
  224. # if social_ebuddy:
  225. # print(social_ebuddy)
  226. # if social_line:
  227. # print(social_line)
  228. # if social_skype:
  229. # print(social_skype)
  230. # if city_current:
  231. # print(city_current)
  232. # if city_home:
  233. # print(city_home)
  234. # if number_mobile:
  235. # print(number_mobile)
  236. # Save to CSV
  237. with open('from_facebook.csv', 'a', newline='') as csvfile:
  238. writer = csv.DictWriter(csvfile, fieldnames=FIELDNAMES)
  239. csv_dict = dict()
  240. csv_dict['Name'] = name_full
  241. if birthday:
  242. csv_dict['Birthday'] = birthday
  243. if gender:
  244. csv_dict['Gender'] = gender
  245. if list_email:
  246. csv_dict['E-mail 1 - Value'] = email
  247. if number_mobile:
  248. csv_dict['Phone 1 - Value'] = number_mobile
  249. writer.writerow(csv_dict)
  250. # # Output Contact Info Types
  251. # match = soup.select('#contact-info > div > div:nth-of-type(2) table tr td div span')
  252. # print(name_full, match)
  253. # for thing in match:
  254. # list_of_things.add(thing.get_text())
  255. # pprint(list_of_things)