from requests import get
from requests.exceptions import RequestException
from contextlib import closing
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.common.exceptions import NoSuchElementException
from bs4 import BeautifulSoup
import platform
import time
import re
import csv
import sys


class Scanner:
    """Scrape a list of URLs for (name, email) contact pairs.

    For each URL the scanner opens the page in headless Chrome, clicks the
    site's "Impressum" (legal notice) link, and searches the resulting page
    for the first email address and the first name adjacent to one of the
    configured keywords.  Results are written to ``people.csv``.
    """

    def __init__(self, urls, keywords):
        # urls: iterable of page URLs to scan.
        # keywords: words expected to appear directly before a contact
        # name on the Impressum page (e.g. a job title).
        self.urls = urls
        self.keywords = keywords

    def response_is_correct(self, resp):
        """Return True if the server answered 200 OK with an HTML body.

        Fix: the original indexed ``resp.headers['Content-Type']``, which
        raises KeyError when the header is missing — its later
        ``is not None`` check was dead code because ``.lower()`` would
        already have thrown.  ``headers.get`` with a default handles the
        missing-header case gracefully.
        """
        content_type = resp.headers.get('Content-Type', '').lower()
        return resp.status_code == 200 and 'html' in content_type

    def get_contacts(self):
        """Scan every configured URL for the first name/email pair on its
        Impressum page, then write all collected pairs to ``people.csv``.

        Exits the process with status 0 after a successful CSV write and
        status 1 on failure (unchanged from the original contract).
        """
        res = []
        # Anchored pattern for a complete (HTML5-style) email address;
        # compiled once per call, not per page.
        mail_pattern = re.compile(r"^[a-zA-Z0-9.!#$%&'*+/=?^_`{|}~-]+@[a-zA-Z0-9](?:[a-zA-Z0-9-]{0,61}[a-zA-Z0-9])?(?:\.[a-zA-Z0-9](?:[a-zA-Z0-9-]{0,61}[a-zA-Z0-9])?)*$")

        # Selenium setup: headless Chrome, certificate errors ignored.
        chrome_options = Options()
        chrome_options.add_argument('--ignore-certificate-errors')
        chrome_options.add_argument("--headless")
        chrome_options.add_argument("--window-size=1920x1080")
        # Fix: the `chrome_options=` keyword was removed in Selenium 4
        # (this file already uses the Selenium-4-style By locators);
        # `options=` works on both Selenium 3 and 4.
        driver = webdriver.Chrome(options=chrome_options)

        try:
            # Loop through all pages; network errors on one URL must not
            # abort the whole scan.
            for url in self.urls:
                try:
                    with closing(get(url, stream=True)) as resp:
                        if self.response_is_correct(resp):
                            print("################################")
                            print("INFO: Scraping {0}".format(url))
                            self._scrape_page(driver, url, mail_pattern, res)
                        else:
                            print("ERROR: The response from {0} did not pass the criteria for correctness".format(url))
                except RequestException as e:
                    print('ERROR: Did not succeed sending request to {0}: {1}'.format(url, str(e)))
        finally:
            # Fix: the original never quit the driver, leaking a headless
            # Chrome process on every run.
            driver.quit()

        self._write_csv(res)

    def _scrape_page(self, driver, url, mail_pattern, res):
        """Open ``url`` in ``driver``, click its Impressum link, and append
        the first name/email pair (or the email alone when no keyword
        matches) found there to ``res``.
        """
        driver.get(url)
        try:
            # Click on Impressum (any casing covered by the XPath).
            impressum_link = driver.find_element(By.XPATH, '//*[contains(text(), "impressum") or contains(text(), "Impressum") or contains(text(),"IMPRESSUM")]')
            driver.execute_script("arguments[0].click()", impressum_link)
        except NoSuchElementException as e:
            print('WARNING: Could not find Impressum link on {0}: {1}'.format(url, str(e)))
            return

        # Parse the entire rendered page body.
        soup = BeautifulSoup(driver.page_source, 'html.parser')
        person = {}

        # The email lookup does not depend on the keyword, so do it once
        # (the original repeated it on every loop iteration).
        mail = soup.find(text=mail_pattern)
        if not mail:
            print("WARNING: Did not find email on {0}".format(url))
            return
        person["Email"] = mail

        name = None
        for keyword in self.keywords:
            # Fix: re.escape() so a keyword containing regex
            # metacharacters cannot break or subvert the pattern.
            person_pattern = re.compile(re.escape(keyword) + r"[ a-zA-Z-.]+[^\/\\#`~\n]")
            name = soup.find(text=person_pattern)
            if name:
                # Strip quoting/whitespace noise from the matched name.
                for char in '"\t\n':
                    name = name.replace(char, "")
                person["Name"] = name
                res.append(person)
                print("SUCCESS: Pair found on {0}".format(url))
                return
            print("WARNING: Did not find keyword {0} on {1}".format(keyword, url))

        # Email found but no keyword matched (or the keyword list is
        # empty): keep the email-only record, Name stays None as in the
        # original.
        person["Name"] = name
        res.append(person)
        print("INFO: No keyword matches found for {0}. Appending only Email...".format(url))

    def _write_csv(self, res):
        """Write the collected contact dicts to ``people.csv`` and exit
        the process: status 0 on success, 1 on any failure (including an
        empty result set, which raises IndexError below — unchanged from
        the original behaviour).
        """
        try:
            # Every record carries both "Email" and "Name", so the header
            # can safely come from the first row.
            keys = res[0].keys()
            # Fix: newline='' is required by the csv module (avoids blank
            # lines on Windows).  Mode 'w' already truncates, so the
            # original's explicit truncate(0) and the extra close() after
            # the with-block were redundant and have been dropped.
            with open('people.csv', 'w', newline='') as output_file:
                dict_writer = csv.DictWriter(output_file, keys)
                dict_writer.writeheader()
                dict_writer.writerows(res)
            print('SUCCESS: Successfully wrote to CSV file')
            sys.exit(0)
        except Exception as e:
            print('FATAL: Failed writing to CSV file: {0}'.format(str(e)))
            sys.exit(1)