from requests import get
from requests.exceptions import RequestException
from contextlib import closing
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.common.exceptions import NoSuchElementException
import platform
import time
import re
import csv
import sys


class Scanner:
    """Collect the first email address found on each URL and save them as CSV.

    Every URL is first probed with a plain HTTP request; pages that answer
    with an HTML document are then rendered in headless Chrome so that
    script-generated content is included in the text that is searched.
    """

    # Bounded address pattern: local part, "@", dotted host, alphabetic TLD.
    # Every character class excludes whitespace, quotes and angle brackets,
    # so a match cannot run on into the surrounding HTML markup.
    MAIL_PATTERN = re.compile(r"[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}")

    # Destination of the result file, relative to the working directory.
    OUTPUT_PATH = '../people.csv'

    # Fixed column order so the CSV header does not depend on dict ordering.
    FIELD_NAMES = ["Email", "Name"]

    # TODO: name extraction is not implemented; every row carries this value.
    PLACEHOLDER_NAME = "Meyer"

    # Seconds to wait for the probe request before treating it as failed.
    REQUEST_TIMEOUT = 30

    # Seconds granted to the browser for client-side rendering.
    RENDER_DELAY = 2

    def __init__(self, urls):
        """Store the iterable of page URLs to scan."""
        self.urls = urls

    def response_is_correct(self, resp):
        """Return True if *resp* has status 200 and an HTML Content-Type.

        A response without a Content-Type header is reported as not correct
        rather than raising KeyError.
        """
        content_type = resp.headers.get('Content-Type')
        return (resp.status_code == 200
                and content_type is not None
                and 'html' in content_type.lower())

    def get_contact(self):
        """Scan every URL, write the contacts to OUTPUT_PATH and return them.

        For each URL the first email address on the rendered page is paired
        with PLACEHOLDER_NAME. URLs whose request fails, or whose page holds
        no address, are reported on stdout and skipped.

        Returns:
            The list of ``{"Email": ..., "Name": ...}`` dicts that was written.

        Raises:
            SystemExit: with status 1 if a URL answers with something other
                than a 200 HTML response, if no contact was found at all, or
                if the CSV file cannot be written.
        """
        res = []

        for url in self.urls:
            # Probe with a cheap request first; the response is closed before
            # the browser starts so the connection is not held open.
            try:
                with closing(get(url, stream=True,
                                 timeout=self.REQUEST_TIMEOUT)) as resp:
                    page_is_html = self.response_is_correct(resp)
            except RequestException as e:
                print('WARNING: Did not succeed sending request to {0}: {1}'.format(url, str(e)))
                continue

            if not page_is_html:
                print("ERROR: The response did not pass the criteria for correctness")
                sys.exit(1)

            email = self._first_email(self._render_page(url))
            if email is None:
                print("INFO: No email address found in {0}".format(url))
                continue

            res.append({"Email": email, "Name": self.PLACEHOLDER_NAME})

        self._write_csv(res)
        return res

    def _render_page(self, url):
        """Load *url* in headless Chrome and return the body's inner HTML."""
        chrome_options = Options()
        chrome_options.add_argument("--headless")
        chrome_options.add_argument("--window-size=1920x1080")
        # NOTE(review): newer Selenium releases take this argument as
        # `options=`; the keyword is left as written - confirm against the
        # Selenium version this project pins.
        driver = webdriver.Chrome(chrome_options=chrome_options)
        try:
            driver.get(url)
            time.sleep(self.RENDER_DELAY)  # Give the page some time to load
            return driver.execute_script("return document.body.innerHTML;")
        finally:
            # Always shut the browser down, otherwise every scanned URL
            # leaves a Chrome process behind.
            driver.quit()

    def _first_email(self, page_content):
        """Return the first email address in *page_content*, or None."""
        if not page_content:
            return None
        match = self.MAIL_PATTERN.search(page_content)
        return match.group(0) if match else None

    @staticmethod
    def _open_csv(path):
        """Open *path* for csv writing in the mode this interpreter needs."""
        if sys.version_info[0] >= 3:
            # Python 3: text mode, newline handling left to the csv module.
            return open(path, 'w', newline='')
        # Python 2: the csv module writes bytes.
        return open(path, 'wb')

    def _write_csv(self, res):
        """Write *res* to OUTPUT_PATH; exit with status 1 if that is impossible."""
        if not res:
            # Say why nothing is written instead of failing on res[0].
            print('ERROR: Failed writing to CSV file: {0}'.format('no contacts were found'))
            sys.exit(1)

        try:
            with self._open_csv(self.OUTPUT_PATH) as output_file:
                dict_writer = csv.DictWriter(output_file, self.FIELD_NAMES)
                dict_writer.writeheader()
                dict_writer.writerows(res)
        except (IOError, OSError, csv.Error) as e:
            print('ERROR: Failed writing to CSV file: {0}'.format(str(e)))
            sys.exit(1)