Add initial scanner

2018-12-18 15:18:04 +01:00
commit 7fbf7e296b
4 changed files with 80 additions and 0 deletions
--- a/.gitignore
+++ b/.gitignore
@@ -0,0 +1 @@
 .vscode/
--- a/app.py
+++ b/app.py
@@ -0,0 +1,3 @@
 from lib.scanner import Scanner
 # Placeholder entry point: Scanner is imported above but not used yet;
 # the script currently only prints a notice.
 print("Yet to be implemented")
--- a/lib/scanner.py
+++ b/lib/scanner.py
@@ -0,0 +1,74 @@
 from requests import get
 from requests.exceptions import RequestException
 from contextlib import closing
 from selenium import webdriver
 from selenium.webdriver.chrome.options import Options
 from selenium.webdriver.common.by import By
 from selenium.common.exceptions import NoSuchElementException
 import platform
 import time
 import re
 import csv
 import sys
 class Scanner:
   """Scan a list of web pages for contact e-mail addresses and export them.

   Every URL is first probed with ``requests``. Pages that answer with an
   HTML ``200`` response are then loaded in headless Chrome, and the rendered
   ``<body>`` markup is searched for the first e-mail address.

   Targets Python 3 (text-mode CSV output).
   """

   # Column order of the generated CSV file.
   CSV_FIELDS = ("Email", "Name")

   # Seconds to wait for the probe request before giving up on a URL.
   REQUEST_TIMEOUT = 10

   # Seconds to wait after navigation so the page can finish loading.
   PAGE_LOAD_DELAY = 2

   # Character-class based pattern: the match stops at the first character
   # that cannot belong to an address (quotes, '<', whitespace, ...), so
   # surrounding HTML is never swallowed into the result.
   MAIL_PATTERN = re.compile(
     r"[A-Za-z0-9._%+-]+@[A-Za-z0-9-]+(?:\.[A-Za-z0-9-]+)*\.[A-Za-z]{2,}"
   )

   def __init__(self, urls):
     """Store the pages to scan.

     Args:
       urls: Iterable of URL strings; each one is requested and rendered
         by ``get_contact``.
     """
     self.urls = urls

   def response_is_correct(self, resp):
     """Check that the server returned status OK with an HTML content type.

     Args:
       resp: Response object exposing ``status_code`` and a ``headers``
         mapping (as returned by ``requests.get``).

     Returns:
       bool: True only if the status code is 200 and a ``Content-Type``
       header is present and mentions ``html`` (case-insensitive).
     """
     # .get() instead of indexing: a response without a Content-Type header
     # must be rejected, not blow up with KeyError.
     content_type = resp.headers.get('Content-Type')
     if resp.status_code != 200 or content_type is None:
       return False
     return 'html' in content_type.lower()

   @classmethod
   def _first_email(cls, page_content):
     """Return the first e-mail address found in *page_content*, or None."""
     match = cls.MAIL_PATTERN.search(page_content)
     return match.group(0) if match else None

   def _render_body(self, url):
     """Load *url* in headless Chrome and return its rendered body HTML."""
     chrome_options = Options()
     chrome_options.add_argument("--headless")
     # Chrome expects "width,height" for this switch.
     chrome_options.add_argument("--window-size=1920,1080")
     driver = webdriver.Chrome(options=chrome_options)
     try:
       driver.get(url)
       time.sleep(self.PAGE_LOAD_DELAY)  # Give the page some time to load
       return driver.execute_script("return document.body.innerHTML;")
     finally:
       # Always shut the browser down, otherwise every scanned URL leaves a
       # Chrome process behind.
       driver.quit()

   def _write_csv(self, rows, output_path):
     """Write *rows* (dicts keyed by CSV_FIELDS) to *output_path*.

     Prints an error and exits with status 1 if the file cannot be written.
     """
     try:
       # Text mode with newline='' is what the csv module requires on
       # Python 3; binary mode makes the writer fail with TypeError.
       with open(output_path, 'w', newline='', encoding='utf-8') as output_file:
         dict_writer = csv.DictWriter(output_file, self.CSV_FIELDS)
         dict_writer.writeheader()
         dict_writer.writerows(rows)
     except (OSError, csv.Error) as e:
       print('ERROR: Failed writing to CSV file: {0}'.format(str(e)))
       sys.exit(1)

   def get_contact(self, output_path='../people.csv'):
     """Collect the first e-mail address of every page and write them to CSV.

     URLs that cannot be requested are skipped with a warning, and pages
     without any e-mail address are skipped with an info message. A response
     that fails ``response_is_correct`` aborts the program with exit status 1.

     Args:
       output_path: Destination of the CSV file. The file is always written;
         with no results it contains only the header row.

     Returns:
       list: One ``{"Email": ..., "Name": ...}`` dict per page that yielded
       an address, in the order of ``self.urls``.
     """
     res = []
     # Loop through all pages
     for url in self.urls:
       try:
         with closing(get(url, stream=True, timeout=self.REQUEST_TIMEOUT)) as resp:
           page_is_html = self.response_is_correct(resp)
       except RequestException as e:
         print('WARNING: Did not succeed sending request to {0}: {1}'.format(url, str(e)))
         continue
       if not page_is_html:
         print("ERROR: The response did not pass the criteria for correctness")
         sys.exit(1)
       email = self._first_email(self._render_body(url))
       if email is None:
         print("INFO: No email address found in {0}".format(url))
         continue
       # TODO: name extraction is not implemented yet; "Meyer" is a
       # placeholder value.
       res.append({"Email": email, "Name": "Meyer"})
     # Write result to CSV
     self._write_csv(res, output_path)
     return res
--- a/requirements.txt
+++ b/requirements.txt
@@ -0,0 +1,2 @@
 requests>=2.20.1
 selenium>=3.141.0
		`@@ -0,0 +1,3 @@`
							`from lib.scanner import Scanner`

							`print("Yet to be implemented")`