email_stats.py 4.2 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137
  1. #!/usr/bin/python
  2. # coding: utf-8
  3. from __future__ import print_function
  4. from __future__ import absolute_import
  5. from __future__ import division
  6. from __future__ import unicode_literals
  7. import mailbox
  8. import email
  9. import os
  10. import time
  11. from collections import Counter, defaultdict
  12. class Event(object):
  13. """
  14. Store timespans in which the only sender seen in a mailbox was 'addr'
  15. """
  16. def __init__(self, addr, ts):
  17. self.addr = addr
  18. self.begin = ts
  19. self.until = ts
  20. def print(self, file=None):
  21. if self.begin != self.until:
  22. print("{}: {}-{}".format(self.addr, self.begin, self.until), file=file)
  23. else:
  24. print("{}: {}".format(self.addr, self.begin), file=file)
  25. class Timeline(object):
  26. """
  27. Store consecutive events
  28. """
  29. def __init__(self):
  30. self.events = []
  31. def add(self, ts, addr):
  32. if not self.events or self.events[-1].addr != addr:
  33. self.events.append(Event(addr, ts))
  34. else:
  35. self.events[-1].until = ts
  36. def gap_lengths(self, ts_now=None):
  37. if ts_now is None: ts_now = time.time()
  38. for idx, evt in enumerate(self.events):
  39. if idx < len(self.events) - 1:
  40. ts_next = self.events[idx + 1].begin
  41. else:
  42. ts_next = ts_now
  43. yield evt.addr, ts_next - evt.until
  44. def print(self, file=None):
  45. for e in self.events:
  46. e.print(file=file)
  47. def read_mbox(pathname):
  48. """
  49. Parse a mailbox and return a sequence of:
  50. (timestamp, sender email address, sender real name)
  51. """
  52. mbox = mailbox.mbox(pathname)
  53. for msg in mbox:
  54. realname, addr = email.utils.parseaddr(msg["From"])
  55. parsed = email.utils.parsedate_tz(msg["Date"])
  56. if parsed is None:
  57. ts = None
  58. else:
  59. ts = email.utils.mktime_tz(parsed)
  60. yield ts, addr, realname
  61. def aggregate(parsed_mbox):
  62. """
  63. Given (timestamp, address, realname) tuples, generate the same tuples
  64. replacing address with the most common address among those used by people
  65. with the same realname
  66. """
  67. # Convert into a list so we can iterate it twice
  68. parsed_mbox = list(parsed_mbox)
  69. # Map realnames to set of email addresses with their occurrence count
  70. by_realname = defaultdict(Counter)
  71. for ts, addr, realname in parsed_mbox:
  72. if realname.endswith(" via nm"): realname = realname[:-7]
  73. by_realname[realname][addr] += 1
  74. # Compute the most common address for a realname
  75. aliases = {}
  76. for c in by_realname.itervalues():
  77. ranked = c.most_common()
  78. for addr, count in ranked:
  79. aliases[addr] = ranked[0][0]
  80. for ts, addr, relname in parsed_mbox:
  81. yield ts, aliases[addr], relname
  82. def filter_top2(parsed_mbox):
  83. """
  84. Given (timestamp, address, realname) tuples, generate only those of the two
  85. most common addresses
  86. """
  87. # Convert into a list so we can iterate it twice
  88. parsed_mbox = list(parsed_mbox)
  89. # Count how many time addresses appear
  90. addr_count = Counter()
  91. for ts, addr, realname in parsed_mbox:
  92. addr_count[addr] += 1
  93. whitelist = { x[0] for x in addr_count.most_common(2) }
  94. for ts, addr, realname in parsed_mbox:
  95. if addr not in whitelist: continue
  96. yield ts, addr, realname
  97. def mailbox_get_gaps(pathname):
  98. """
  99. Compute waiting gaps for a mailbox
  100. """
  101. timeline = Timeline()
  102. for ts, addr, realname in sorted(filter_top2(aggregate(read_mbox(pathname)))):
  103. timeline.add(ts, addr)
  104. return timeline.gap_lengths()
  105. #for f in os.listdir("."):
  106. # if not f.endswith(".mbox"): continue
  107. # print(" *", f)
  108. # #for ts, addr, realname in aggregate(read_mbox(f)):
  109. # # print(ts, addr, realname)
  110. # timeline = Timeline()
  111. # for ts, addr, realname in sorted(filter_top2(aggregate(read_mbox(f)))):
  112. # timeline.add(ts, addr)
  113. # byaddr = {}
  114. # for addr, length in timeline.gap_lengths():
  115. # byaddr.setdefault(addr, []).append(length)
  116. # print(addr, length)
  117. # for addr, lengths in byaddr.iteritems():
  118. # print("Avg wtime for {}: {}".format(addr, sum(lengths)/len(lengths)))
  119. #
  120. # #timeline.print()