From c7c439b5ad6b93f48c1706c14e21af2fa50c7356 Mon Sep 17 00:00:00 2001
From: Greg Gauthier <gmgauthier@protonmail.com>
Date: Wed, 21 Oct 2020 23:57:48 +0100
Subject: [PATCH] final commit

---
 .gitignore           |  3 +++
 Pipfile              | 13 ++++++++++
 README.md            | 61 ++++++++++++++++++++++++++++++++++++++++----
 conftest.py          |  7 +++++
 email_pruner.py      | 54 +++++++++++++++++++++++++++------------
 test_email_pruner.py | 59 ++++++++++++++++++++++++++++++++++++++++++
 6 files changed, 176 insertions(+), 21 deletions(-)
 create mode 100644 Pipfile
 create mode 100644 conftest.py
 create mode 100644 test_email_pruner.py

diff --git a/.gitignore b/.gitignore
index 864ac8e..f587acb 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,3 +1,6 @@
 .idea/
 *.iml
 *.pyc
+__pycache__/
+*.lock
+.pytest_cache/
diff --git a/Pipfile b/Pipfile
new file mode 100644
index 0000000..0ac07b6
--- /dev/null
+++ b/Pipfile
@@ -0,0 +1,13 @@
+[[source]]
+name = "pypi"
+url = "https://pypi.org/simple"
+verify_ssl = true
+
+[dev-packages]
+pytest = "*"
+
+[packages]
+pytest = "*"
+
+[requires]
+python_version = "3.8"
\ No newline at end of file
diff --git a/README.md b/README.md
index 5716096..50ec4a5 100644
--- a/README.md
+++ b/README.md
@@ -1,9 +1,60 @@
-# email-prune
-automattic coding challenge solution
+# email-pruner
+### automattic coding challenge solution
 
-author: Greg Gauthier
+* author: Greg Gauthier
+* job application: Senior Quality Engineer 
+* date: 21 October 2020
 
-application: Senior Quality Engineer 
+#### Requirements
+* Python 3.7+ (needed for the f-string print statements)
+* Pipenv ~2020.8.13 (for virtualenv dev)
+* Pytest 6+
 
-Requireds Python 3.7+
+#### Setup
+* cd into the root of the project
+* type these commands
+```shell script
+$ python3 -m pip install pipenv 
+$ pipenv --python 3.7
+$ pipenv install
+$ pipenv shell
+```
+This will drop you into the virtualenv, with the right packages already installed. Next, all you need to do is run the tests, then run the app, which should look something like this:
+
+
+```shell script
+(email-prune) [23:02:47][~/Projects/Coding/Python/email-prune]
+gmgauthier@shackleton $ pytest -vv                         
+=================== test session starts ==============================================================================
+platform darwin -- Python 3.8.6, pytest-6.1.1, py-1.9.0, pluggy-0.13.1 -- /Users/gmgauthier/.local/share/virtualenvs/email-prune-6GbCapbV/bin/python
+cachedir: .pytest_cache
+rootdir: /Users/gmgauthier/Projects/Coding/Python/email-prune
+collected 6 items
+
+test_email_pruner.py::test_email_creation PASSED                                                                 [ 16%]
+test_email_pruner.py::test_dup_list_creation PASSED                                                              [ 33%]
+test_email_pruner.py::test_compare_dups_and_pruned PASSED                                                        [ 50%]
+test_email_pruner.py::test_alternative_pruner PASSED                                                             [ 66%]
+test_email_pruner.py::test_random_string_contents PASSED                                                         [ 83%]
+test_email_pruner.py::test_random_string_len PASSED                                                              [100%]
+
+============================ 6 passed in 0.03s ========================================================================
+(email-prune) [23:02:55][~/Projects/Coding/Python/email-prune]
+gmgauthier@shackleton $ python ./email_pruner.py -e 750000              
+GENERATED COMPLETE LIST WITH DUPLICATES: (count = 1500000)
+Elapsed Time:  0:00:57.541989
+IDENTIFIED DUPLICATES IN COMPLETE LIST: (count = 750000)
+Elapsed time:  0:00:00.545665
+
+TOTAL ELAPSED TIME: 0:00:58.087654
+```
+
+### NOTES
+I have a lot of comments in the code and on the tests that explain my reasoning around certain decisions. I'll just explain the console output here. 
+
+What you're seeing echoed out to the console is a record of the amount of time it took to execute the two major steps in this code: (a) the generation of the email list (which includes the duplications inserted in random order), and (b) the identification of those duplications, including bifurcating the list into two separate lists: originals, and duplicates. As you can see, this particular execution was a sort of simple "load test" on the app. The requirements called for isolating the duplicates in 100,000 emails in less than a second. This code was able to do 1.5 million, in 546 milliseconds. Not bad! 
+
+The tests are run with pytest. They are designed to run quickly. I'm only seeding 100 emails. The point is merely to demonstrate the functionality of the methods I wrote, and to showcase the importance of TESTING the application (and to demonstrate that I can reason good assertions from the requirements). 
+
+I should mention, I could have wrapped the tests in the Behave DSL, but chose not to for this challenge because the nature of the work being done in this application is at the functional integration level, rather than at the level of user interaction. Gherkin specifications are best used in the context of a behavioral relationship between user and application, rather than as a tool for "englishifying" component level specifications. The "raw" test code is much more instructive, if you know what you're looking for.
 
diff --git a/conftest.py b/conftest.py
new file mode 100644
index 0000000..b460558
--- /dev/null
+++ b/conftest.py
@@ -0,0 +1,7 @@
+import pytest
+from email_pruner import spawn
+
+
+@pytest.fixture(scope="session", autouse=True)
+def emails():
+    return spawn(100)  # built once per session; test_email_creation expects 200 entries back
diff --git a/email_pruner.py b/email_pruner.py
index 6c3b519..475adff 100644
--- a/email_pruner.py
+++ b/email_pruner.py
@@ -1,7 +1,8 @@
-from string import ascii_letters
-from secrets import choice
-from timeit import default_timer as timer
+from argparse import ArgumentParser as ap
 from datetime import timedelta
+from secrets import choice
+from string import ascii_letters
+from timeit import default_timer as timer
 
 
 def reset_stopwatch():
@@ -10,7 +11,7 @@ def reset_stopwatch():
 
 def get_elapsed(starttime):
     end = timer()
-    return timedelta(seconds=end-starttime)
+    return timedelta(seconds=end - starttime)
 
 
 def randstring(strlen=64):
@@ -18,7 +19,7 @@ def randstring(strlen=64):
 
 
 def spawn(listlen=100):
-    base_list = [randstring(10)+"."+randstring(10)+"@"+randstring(15)+".com" for _ in range(listlen)]
+    base_list = [randstring(10) + "." + randstring(10) + "@" + randstring(15) + ".com" for _ in range(listlen)]
     dup_list = [choice(base_list) for _ in range(len(base_list))]
     final_list = []
     for i in range(listlen):
@@ -34,29 +35,50 @@ def dups(biglist):
         if x not in seen:
             uneek.append(x)
             seen.add(x)
-    return seen
+    return list(seen), uneek
+
+
+# NOTE:
+# When only the de-duplicated list is needed, there is a
+# much simpler, more "pythonic" way to prune: dict keys are
+# unique and keep insertion order, so duplicates just drop out.
+def prune(biglist):
+    return list(dict.fromkeys(biglist))
 
 
 if __name__ == "__main__":
+    parser = ap()
+    parser.add_argument('-e', '--emails', type=int, default=1000, metavar="emails",
+                        help='The number of emails to generate (default=1000)', required=False)
+    args = parser.parse_args()
+    email_count = args.emails
+
+    # NOTE: The spawning process takes an enormous amount of time,
+    # but since the challenge didn't say anything about how long it takes to
+    # generate 100,000 emails (only how long it takes to de-dupe them), I
+    # didn't do much to try to optimize the creation of the list of emails.
+    # But I will say, that I kept it entirely in memory, to avoid having to
+    # deal with disk i/o.
     start = reset_stopwatch()
-    list_with_dups = spawn(50000)
+    list_with_dups = spawn(email_count)
     print(f"GENERATED COMPLETE LIST WITH DUPLICATES: (count = {len(list_with_dups)})")
     # [print(i) for i in list_with_dups]
     t1 = get_elapsed(start)
     print("Elapsed Time: ", t1)
 
+
+    # This is the part we really care about. This step takes the generated list,
+    # and runs it through the de-duplicator, returning two lists: the originals,
+    # and the duplicates. Note, that these lists are identical in LENGTH ONLY,
+    # because the bifurcation process leaves them unsorted, according to the
+    # requirements. If sorted, they could be shown to be identical in content
+    # as well.
     start = reset_stopwatch()
-    dup_list = dups(list_with_dups)
+    dup_list, orig_list = dups(list_with_dups)
     print(f"IDENTIFIED DUPLICATES IN COMPLETE LIST: (count = {len(dup_list)})")
     # [print(i) for i in dup_list]
     t2 = get_elapsed(start)
     print("Elapsed time: ", t2)
 
-    start = reset_stopwatch()
-    list_with_dups = list(dict.fromkeys(list_with_dups))
-    print(f"GENERATED PRUNED LIST WITHOUT DUPLICATES: (count = {len(list_with_dups)})")
-    # [print(i) for i in list_with_dups]
-    t3 = get_elapsed(start)
-    print("Elapsed Time: ", t3)
-    print(f"TOTAL ELAPSED TIME: {t1+t2+t3}")
-    print(f"ELAPSED TIME WITHOUT GENERATOR: {t2+t3}")
+    print("\n\n")
+    print(f"TOTAL ELAPSED TIME: {t1 + t2}")
diff --git a/test_email_pruner.py b/test_email_pruner.py
new file mode 100644
index 0000000..50dd5fd
--- /dev/null
+++ b/test_email_pruner.py
@@ -0,0 +1,59 @@
+# functional pytests
+from email_pruner import dups, randstring, prune
+
+
+# pytest uses some under-the-covers magic to make fixtures available to
+# test methods. Check the "conftest.py" for the source of the "emails"
+# argument you see in these tests.
+
+def test_email_creation(emails):
+    # The emails fixture (conftest.py) calls spawn(100).
+    # spawn returns double the number requested, because
+    # it adds a randomly chosen duplicate for every email
+    # it creates, so we expect 200 entries.
+    assert len(emails) == 200
+
+
+def test_dup_list_creation(emails):
+    # dups returns two lists: list(seen) and the
+    # order-preserving unique list. The fixture seeds
+    # 100 base emails with 50% duplication, so each
+    # list should hold 100 entries and the two should
+    # be the same length.
+    dup_list, prune_list = dups(emails)
+    assert len(dup_list) == 100
+    assert len(dup_list) == len(prune_list)
+
+
+def test_compare_dups_and_pruned(emails):
+    # The spec says to leave the two lists unsorted, so a direct
+    # element-for-element comparison should fail: the bifurcation
+    # produces two differently ordered lists. The second assertion
+    # confirms the lists hold the same content once sorted. It uses
+    # sorted() because list.sort() sorts in place and returns None,
+    # so comparing two .sort() results is always None == None and
+    # can never fail.
+    dup_list, prune_list = dups(emails)
+    assert dup_list != prune_list
+    assert sorted(dup_list) == sorted(prune_list)
+
+
+def test_alternative_pruner(emails):
+    # dict.fromkeys gives a much cleaner and simpler way
+    # to prune duplicates from a list. However, it discards
+    # the duplicates rather than handing them back, so it is
+    # included here for illustration only: the main flow
+    # wants both lists, to prove the de-duplication worked.
+    pruned = prune(emails)
+    assert len(pruned) == 100
+
+
+def test_random_string_contents():
+    # Test each character: `not_allowed in rstring` is a substring search that would never match.
+    not_allowed = "!\"#$%&'()*+,-./:;<=>?@[\\]^_`{|}~0123456789"
+    assert not any(ch in not_allowed for ch in randstring())  # default length = 64, alphas only
+
+
+def test_random_string_len():
+    rstring = randstring() # no argument, so strlen defaults to 64
+    assert len(rstring) == 64
\ No newline at end of file