babynames.py 4.8 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168
  1. #!/usr/bin/env python3
  2. # Copyright 2010 Google Inc.
  3. # Licensed under the Apache License, Version 2.0
  4. # http://www.apache.org/licenses/LICENSE-2.0
  5. # Google's Python Class
  6. # http://code.google.com/edu/languages/google-python-class/
  7. import sys
  8. import re
  9. from typing import List
"""
Exercise notes.

Part A:
1) extract_names() - OK
2) If a name is already in the list, keep the entry
with the smaller number (the better rank)
3) In main, print the returned list
via text = '\n'.join(mylist) + '\n'
so that the result looks like:
2006
Aaliyah 91
Aaron 57
Abagail 895
Abbey 695
Abbie 650
...
4) Make it possible to accept several files
Part B:
1) If --summaryfile is given, write the output for each foo.html
to the corresponding foo.html.summary
"""
  30. def extract_names(filename: str) -> List[str]:
  31. """
  32. ['2006', 'Aaliyah 91', Aaron 57', 'Abagail 895', ' ...]
  33. """
  34. xlist = []
  35. tmp_list = []
  36. year = ""
  37. with open(filename, 'r') as file:
  38. #print("RR_F")
  39. for line in file:
  40. if 'size="4" value="' in line:
  41. year = re.search(r'\d{4}', line)
  42. #print(year.group())
  43. if year.group() == None:
  44. continue
  45. else:
  46. xlist.append(year.group())
  47. elif '><td>' in line:
  48. #rangNameList = re.findall(r'<td>\d*</td><td>\w*</td><td>\w*', line)
  49. names_line = re.search(r'<td>\d*</td><td>\w*</td><td>\w*', line)
  50. if names_line == None:
  51. continue
  52. else:
  53. # Без такого блока получаю broken pipe, если много в терминал плюю
  54. rank = None
  55. rank_male_name = None
  56. male_name_string = None
  57. male_name = None
  58. rank_fem_name = None
  59. fem_name_string = None
  60. fem_name_search = None
  61. fem_name = None
  62. #print(type(names_line.group()))
  63. rank = re.search(r'[0-9]{1,}', names_line.group()).group()
  64. #print(rank)
  65. male_name_string = re.search(re.escape(">") + r'[A-z]+' + re.escape("<"), names_line.group()).group()
  66. male_name = re.search(r'[A-z]+', male_name_string).group()
  67. #print("M name: ", male_name)
  68. #print(type(names_line.group()[::-1]))
  69. fem_name_string = re.search(re.escape("<") + r'[A-z]+' + re.escape(">"), names_line.group()[::-1]).group()
  70. fem_name = re.search(r'[A-z]+', fem_name_string).group()
  71. #print("F name: ", fem_name[::-1])
  72. rank_male_name = male_name + " " + rank
  73. rank_fem_name = fem_name[::-1] + " " + rank
  74. #print("F rank name: ", rank_fem_name)
  75. for namesPlusRank in tmp_list:
  76. fem_name_search = fem_name[::-1]
  77. if re.match(re.escape(male_name) + r' [0-9]+', namesPlusRank):
  78. #print("detected mal: ", male_name)
  79. if int(rank) <= int(re.search(r'[0-9]+', namesPlusRank).group()):
  80. tmp_list.remove(namesPlusRank)
  81. else:
  82. rank_male_name = namesPlusRank
  83. tmp_list.remove(namesPlusRank)
  84. elif re.match(re.escape(fem_name_search) + r' [0-9]+', namesPlusRank):
  85. #print("detected fem: ", fem_name_search)
  86. if int(rank) <= int(re.search(r'[0-9]+', namesPlusRank).group()):
  87. tmp_list.remove(namesPlusRank)
  88. else:
  89. rank_fem_name = namesPlusRank
  90. tmp_list.remove(namesPlusRank)
  91. #print("append block: ", rank_male_name)
  92. #tmp_list.remove(rank_male_name)
  93. tmp_list.append(rank_male_name)
  94. #male_name = None
  95. #tmp_list.remove(rank_fem_name)
  96. tmp_list.append(rank_fem_name)
  97. #fem_name = None
  98. #tmp_list.append(rangNameList[3]+" "+rangNameList[0])
  99. #str(list(filter(re.compile('\w{2,}').match, rangNameList))[0])+" "+
  100. #print(tmp_list)
  101. tmp_list.sort()
  102. #print(tmp_list)
  103. xlist.extend(tmp_list)
  104. #print(xlist)
  105. return xlist
  106. def main():
  107. # This command-line parsing code is provided.
  108. # Make a list of command line arguments, omitting the [0] element
  109. # which is the script itself.
  110. args = sys.argv[1:]
  111. if not args:
  112. print('usage: [--summaryfile] file [file ...]')
  113. sys.exit(1)
  114. # Notice the summary flag and remove it from args if it is present.
  115. summary = False
  116. if args[0] == '--summaryfile':
  117. summary = True
  118. del args[0]
  119. #print(len(args))
  120. for i in args:
  121. if summary:
  122. print("summary")
  123. else:
  124. #extract_names(i)
  125. #print(extract_names(i))
  126. print('\n'.join(extract_names(i)) + '\n')
  127. # +++your code here+++
  128. # For each filename, get the names, then either print the text output
  129. # or write it to a summary file
  130. if __name__ == '__main__':
  131. main()