email-prune/email_pruner.py

85 lines
2.7 KiB
Python

from argparse import ArgumentParser as ap
from datetime import timedelta
from secrets import choice
from string import ascii_letters
from timeit import default_timer as timer
def reset_stopwatch():
return timer()
def get_elapsed(starttime):
end = timer()
return timedelta(seconds=end - starttime)
def randstring(strlen=64):
return ''.join(choice(ascii_letters) for _ in range(strlen))
def spawn(listlen=100):
base_list = [randstring(10) + "." + randstring(10) + "@" + randstring(15) + ".com" for _ in range(listlen)]
dup_list = [choice(base_list) for _ in range(len(base_list))]
final_list = []
for i in range(listlen):
final_list.append(base_list[i])
final_list.append(dup_list[i])
return final_list
def dups(biglist):
seen = set()
uneek = []
for x in biglist:
if x not in seen:
uneek.append(x)
seen.add(x)
return list(seen), uneek
# NOTE:
# In the event that you do not need both lists,
# there is a much simpler, more "pythonic", way
# to do the pruning with python:
def prune(biglist):
return list(dict.fromkeys(biglist))
if __name__ == "__main__":
parser = ap()
parser.add_argument('-e', '--emails', type=int, default=1000, metavar="emails",
help='The number of emails to generate (default=1000)', required=False)
args = parser.parse_args()
email_count = args.emails
# NOTE: The spawning process takes an enormous amount of time,
# but since the challenge didn't say anything about how long it takes to
# generate 100,000 emails (only how long it takes to de-dupe them), I
# didn't do much to try to optimize the creation of the list of emails.
# But I will say, that I kept it entirely in memory, to avoid having to
# deal with disk i/o.
start = reset_stopwatch()
list_with_dups = spawn(email_count)
print(f"GENERATED COMPLETE LIST WITH DUPLICATES: (count = {len(list_with_dups)})")
# [print(i) for i in list_with_dups]
t1 = get_elapsed(start)
print("Elapsed Time: ", t1)
# This is the part we really care about. This step takes the generated list,
# and runs it through the de-duplicator, returning two lists: the originals,
# and the duplicates. Note, that these lists are identical in LENGTH ONLY,
# because the bifurcation process leaves them unsorted, according to the
# requirements. If sorted, they could be shown to be identical in content
# as well.
start = reset_stopwatch()
dup_list, orig_list = dups(list_with_dups)
print(f"IDENTIFIED DUPLICATES IN COMPLETE LIST: (count = {len(dup_list)})")
# [print(i) for i in dup_list]
t2 = get_elapsed(start)
print("Elapsed time: ", t2)
print("\n\n")
print(f"TOTAL ELAPSED TIME: {t1 + t2}")