123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168 |
- #!/usr/bin/env python3
- # Copyright 2010 Google Inc.
- # Licensed under the Apache License, Version 2.0
- # http://www.apache.org/licenses/LICENSE-2.0
- # Google's Python Class
- # http://code.google.com/edu/languages/google-python-class/
- import sys
- import re
- from typing import List
- """
- A. :
- 1) extract_names() OK
- 2) Если имя уже есть в списке - выдать то,
- что с меньшей цифрой
- 3) В main распечатывать возвращаемый список
- через text = '\n'.join(mylist) + '\n'
- чтобы результат:
- 2006
- Aaliyah 91
- Aaron 57
- Abagail 895
- Abbey 695
- Abbie 650
- ...
- 4) Сделать возможность принимать несколько файлов
- B. :
- 1) Если --summaryfile записать вывод для каждого foo.html
- в соответствующий foo.html.summary
- """
def extract_names(filename: str) -> List[str]:
    """Extract the year and the ranked baby names from one HTML file.

    Lines containing ``size="4" value="`` supply the year (the first run of
    four digits on that line). Table rows shaped like
    ``<td>RANK</td><td>MALE</td><td>FEMALE`` supply two names that share one
    rank. When a name occurs more than once in the file, the numerically
    smallest rank is kept.

    Args:
        filename: Path of the HTML file to read.

    Returns:
        The year string(s) in file order, followed by ``"Name rank"`` entries
        sorted alphabetically by name, e.g.
        ``['2006', 'Aaliyah 91', 'Aaron 57', 'Abagail 895', ...]``.

    Raises:
        OSError: If the file cannot be opened.
    """
    year_re = re.compile(r'\d{4}')
    # One pattern with capture groups yields rank, male and female name at
    # once; both names are taken from their own cell.
    row_re = re.compile(r'<td>(\d+)</td><td>([A-Za-z]+)</td><td>([A-Za-z]+)')

    years = []
    best_rank = {}  # name -> smallest rank seen so far
    with open(filename, 'r') as file:
        for line in file:
            if 'size="4" value="' in line:
                year = year_re.search(line)
                # Test the match object itself: calling .group() on a failed
                # search would raise AttributeError.
                if year is not None:
                    years.append(year.group())
            elif '><td>' in line:
                row = row_re.search(line)
                if row is None:
                    continue
                rank = int(row.group(1))
                for name in row.group(2, 3):
                    if name not in best_rank or rank < best_rank[name]:
                        best_rank[name] = rank

    # Names are unique here, so sorting by name gives the same order as
    # sorting the finished "Name rank" strings.
    ranked = ['{} {}'.format(name, best_rank[name]) for name in sorted(best_rank)]
    return years + ranked
def main():
    """Print or save the extracted names for each file on the command line.

    Usage: ``[--summaryfile] file [file ...]``.

    Without the flag, the result for each file is printed to stdout, one
    entry per line. With ``--summaryfile`` the same text is written to
    ``<file>.summary`` for each input file instead. Exits with status 1 when
    no input file is given.
    """
    # Command-line arguments, omitting the [0] element (the script itself).
    args = sys.argv[1:]

    # Notice the summary flag and remove it from args if it is present.
    summary = False
    if args and args[0] == '--summaryfile':
        summary = True
        del args[0]

    # Checked after the flag is stripped so that a bare "--summaryfile"
    # also shows the usage text instead of silently doing nothing.
    if not args:
        print('usage: [--summaryfile] file [file ...]')
        sys.exit(1)

    for filename in args:
        text = '\n'.join(extract_names(filename)) + '\n'
        if summary:
            with open(filename + '.summary', 'w') as out:
                out.write(text)
        else:
            print(text)
# Run the command-line entry point only when executed as a script.
if __name__ == "__main__":
    main()
|