Add initial scanner

2018-12-18 15:18:04 +01:00
commit 7fbf7e296b
4 changed files with 80 additions and 0 deletions
--- a/.gitignore
+++ b/.gitignore
@@ -0,0 +1 @@
+.vscode/
--- a/app.py
+++ b/app.py
@@ -0,0 +1,3 @@
+from lib.scanner import Scanner  # imported but not used by this stub yet
+
+print("Yet to be implemented")  # placeholder output; no scanning is started here
--- a/lib/scanner.py
+++ b/lib/scanner.py
@@ -0,0 +1,74 @@
+from requests import get
+from requests.exceptions import RequestException
+from contextlib import closing
+from selenium import webdriver
+from selenium.webdriver.chrome.options import Options
+from selenium.webdriver.common.by import By
+from selenium.common.exceptions import NoSuchElementException
+import platform
+import time
+import re
+import csv
+import sys
+
+class Scanner:
+  """Scrape the first e-mail address from each URL and store the results as CSV."""
+
+  # Conservative address shape; unlike a greedy `.+` tail it cannot swallow HTML markup.
+  MAIL_PATTERN = re.compile(r"[A-Za-z0-9._%+-]+@[A-Za-z0-9-]+(?:\.[A-Za-z0-9-]+)+")
+
+  def __init__(self, urls):
+    """Remember the iterable of page URLs that get_contact() will visit."""
+    self.urls = urls
+
+  def response_is_correct(self, resp):
+    """Return True if resp has status 200 and an HTML Content-Type; a missing header gives False."""
+    content_type = resp.headers.get('Content-Type') or ''
+    return resp.status_code == 200 and 'html' in content_type.lower()
+
+  def get_contact(self):
+    """Collect one e-mail/name record per URL and write them to ../people.csv.
+
+    Every URL is probed with requests first; pages that pass response_is_correct()
+    are then rendered in headless Chrome and the first address in the body is kept.
+    A failed request only prints a warning and skips the URL, whereas a response
+    that fails the check, or a CSV write error, terminates the process with status 1.
+    """
+    res = []
+    for url in self.urls:
+      try:
+        with closing(get(url, stream=True)) as resp:
+          if not self.response_is_correct(resp):
+            print("ERROR: The response did not pass the criteria for correctness")
+            sys.exit(1)
+      except RequestException as e:
+        print('WARNING: Did not succeed sending request to {0}: {1}'.format(url, str(e)))
+        continue
+
+      chrome_options = Options()
+      chrome_options.add_argument("--headless")
+      chrome_options.add_argument("--window-size=1920,1080") # Chrome expects "width,height"
+      driver = webdriver.Chrome(options=chrome_options) # `chrome_options=` is deprecated
+      try:
+        driver.get(url)
+        time.sleep(2) # Give the page some time to load
+        page_content = driver.execute_script("return document.body.innerHTML;")
+      finally:
+        driver.quit() # Always release the browser, even if loading the page failed
+
+      match = self.MAIL_PATTERN.search(page_content)
+      if match is None:
+        print("INFO: No email address found in {0}".format(url))
+        continue
+      res.append({"Email": match.group(0), "Name": "Meyer"}) # NOTE: name is hard-coded; no name extraction exists yet
+
+    # Text mode with newline='' is what csv needs on Python 3; fixed field names keep the header valid when res is empty
+    try:
+      with open('../people.csv', 'w', newline='') as output_file:
+        dict_writer = csv.DictWriter(output_file, ("Email", "Name"))
+        dict_writer.writeheader()
+        dict_writer.writerows(res)
+    except (OSError, csv.Error) as e:
+      print('ERROR: Failed writing to CSV file: {0}'.format(str(e)))
+      sys.exit(1)
+
--- a/requirements.txt
+++ b/requirements.txt
@@ -0,0 +1,2 @@
+requests>=2.20.1
+selenium>=3.141.0