commit 7fbf7e296bc14320336087cd494de02a10550e70
Author: Ivaylo Ivanov
Date:   Tue Dec 18 15:18:04 2018 +0100

    Add initial scanner

diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..dbe9c82
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1 @@
+.vscode/
\ No newline at end of file
diff --git a/app.py b/app.py
new file mode 100644
index 0000000..99e6c4a
--- /dev/null
+++ b/app.py
@@ -0,0 +1,3 @@
+from lib.scanner import Scanner
+
+print("Yet to be implemented")
diff --git a/lib/scanner.py b/lib/scanner.py
new file mode 100644
index 0000000..5d2673b
--- /dev/null
+++ b/lib/scanner.py
@@ -0,0 +1,110 @@
+from requests import get
+from requests.exceptions import RequestException
+from contextlib import closing
+from selenium import webdriver
+from selenium.webdriver.chrome.options import Options
+from selenium.webdriver.common.by import By
+from selenium.common.exceptions import NoSuchElementException
+import platform
+import time
+import re
+import csv
+import sys
+
+
+class Scanner:
+    """Visit a list of URLs and collect the first e-mail address of each page."""
+
+    # Restricting the character classes keeps a match from running on into
+    # the HTML markup that surrounds an address in the page source.
+    MAIL_PATTERN = re.compile(
+        r"[A-Za-z0-9._%+-]+@[A-Za-z0-9-]+(?:\.[A-Za-z0-9-]+)+")
+    OUTPUT_PATH = '../people.csv'
+
+    def __init__(self, urls):
+        """Remember the URLs that get_contact() will visit."""
+        self.urls = urls
+
+    def response_is_correct(self, resp):
+        """
+        Check if the server returned status OK, the content type header is
+        set and it is html
+        """
+        # .get() instead of [] so that a missing header means "not correct"
+        # rather than a KeyError.
+        content_type = resp.headers.get('Content-Type')
+        if resp.status_code != 200 or content_type is None:
+            return False
+        return 'html' in content_type.lower()
+
+    def get_contact(self):
+        """
+        Get the first name and email pair on every page and write all of
+        them to OUTPUT_PATH as CSV.
+
+        A failed request only prints a warning and skips that URL. A response
+        that is not a 200 HTML page, or a failure to write the CSV file,
+        prints an error and exits with status 1.
+        """
+        res = []
+        # Loop through all pages
+        for url in self.urls:
+            try:
+                with closing(get(url, stream=True)) as resp:
+                    if not self.response_is_correct(resp):
+                        print("ERROR: The response did not pass the criteria for correctness")
+                        sys.exit(1)
+            except RequestException as e:
+                print('WARNING: Did not succeed sending request to {0}: {1}'.format(url, str(e)))
+                continue
+
+            match = self.MAIL_PATTERN.search(self._get_page_content(url))
+            if match is None:
+                print("INFO: No results in {0}".format(url))
+                continue
+
+            person = {}
+            person["Email"] = match.group(0)
+            person["Name"] = "Meyer"  # TODO: placeholder, the name is not extracted yet
+            res.append(person)
+
+        if not res:
+            # Nothing to write, and res[0] would raise IndexError below.
+            print("INFO: No contacts found, nothing written to {0}".format(self.OUTPUT_PATH))
+            return
+        self._write_csv(res)
+
+    def _get_page_content(self, url):
+        """Return the inner HTML of the body of url, rendered by headless Chrome."""
+        # Set properties for selenium
+        chrome_options = Options()
+        chrome_options.add_argument("--headless")
+        chrome_options.add_argument("--window-size=1920x1080")
+        # "options" replaces the deprecated "chrome_options" keyword
+        driver = webdriver.Chrome(options=chrome_options)
+        try:
+            driver.get(url)
+            time.sleep(2)  # Give the page some time to load
+            # Get the entire page body
+            return driver.execute_script("return document.body.innerHTML;")
+        finally:
+            # Always shut the browser down, otherwise every URL leaks a
+            # Chrome process
+            driver.quit()
+
+    def _write_csv(self, rows):
+        """Write rows, a non-empty list of dicts with identical keys, to OUTPUT_PATH."""
+        try:
+            # The csv module needs a binary file on Python 2 and a text file
+            # opened with newline='' on Python 3
+            if sys.version_info[0] < 3:
+                output_file = open(self.OUTPUT_PATH, 'wb')
+            else:
+                output_file = open(self.OUTPUT_PATH, 'w', newline='')
+            with output_file:
+                dict_writer = csv.DictWriter(output_file, list(rows[0].keys()))
+                dict_writer.writeheader()
+                dict_writer.writerows(rows)
+        except (IOError, OSError, csv.Error) as e:
+            print('ERROR: Failed writing to CSV file: {0}'.format(str(e)))
+            sys.exit(1)
diff --git a/requirements.txt b/requirements.txt
new file mode 100644
index 0000000..46ac4fd
--- /dev/null
+++ b/requirements.txt
@@ -0,0 +1,2 @@
+requests>=2.20.1
+selenium>=3.141.0
\ No newline at end of file