contact-scan/lib/scanner.py

from requests import get
from requests.exceptions import RequestException
from contextlib import closing
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.common.exceptions import NoSuchElementException
from bs4 import BeautifulSoup
import platform
import time
import re
import csv
import sys

class Scanner:
  def __init__(self, urls, keywords):
    self.urls = urls
    self.keywords = keywords

  """
    Check if the server returned status OK, the content type header is set and it is html
  """
  def response_is_correct(self, resp):
    content_type = resp.headers['Content-Type'].lower()
    return (resp.status_code == 200 and content_type is not None and content_type.find('html') > -1)

  """
    Get the the first name and email pair on the page
  """
  def get_contacts(self):
    res = []
    mail_pattern = re.compile(r"^[a-zA-Z0-9.!#$%&'*+/=?^_`{|}~-]+@[a-zA-Z0-9](?:[a-zA-Z0-9-]{0,61}[a-zA-Z0-9])?(?:\.[a-zA-Z0-9](?:[a-zA-Z0-9-]{0,61}[a-zA-Z0-9])?)*$")
    # Set properties for selenium
    chrome_options = Options()
    chrome_options.add_argument('--ignore-certificate-errors')
    chrome_options.add_argument("--headless")
    chrome_options.add_argument("--window-size=1920x1080")
    driver = webdriver.Chrome(chrome_options=chrome_options)

    # Loop through all pages
    for url in self.urls:
      try:
        with closing(get(url, stream=True)) as resp:
          if self.response_is_correct(resp):
            print("################################")
            print("INFO: Scraping {0}".format(url))

            # Open in selenium
            driver.get(url)

            # Click on Impressum
            try:
              impressum_link = driver.find_element(By.XPATH, '//*[contains(text(), "impressum") or contains(text(), "Impressum") or contains(text(),"IMPRESSUM")]')
              driver.execute_script("arguments[0].click()", impressum_link)

              # Get the entire page body
              soup = BeautifulSoup(driver.page_source, 'html.parser')
              person = {}
              mail = ""
              name = ""
              for keyword in self.keywords:
                person_regex = r"" + keyword + r"[ a-zA-Z-.]+[^\/\\#`~\n]"
                person_pattern = re.compile(person_regex)
                # Find the first thing that matches the mail pattern and the keyword
                mail = soup.find(text = mail_pattern)
                name = soup.find(text = person_pattern)

                if mail:
                  person["Email"] = mail
                else:
                  print("WARNING: Did not find email on {0}".format(url))
                  break

                if name:
                  to_replace = '"\t\n'
                  for char in to_replace:
                    name = name.replace(char, "")
                  person["Name"] = name
                  res.append(person)
                  print("SUCCESS: Pair found on {0}".format(url))
                  break
                else:
                  print("WARNING: Did not find keyword {0} on {1}".format(keyword, url))

              if mail != None and name == None:
                person["Name"] = name
                person["Email"] = mail
                res.append(person)
                print("INFO: No keyword matches found for {0}. Appending only Email...".format(url))

            except NoSuchElementException as e:
              print('WARNING: Could not find Impressum link on {0}: {1}'.format(url, str(e)))
          else:
            print("ERROR: The response from {0} did not pass the criteria for correctness".format(url))
      except RequestException as e:
        print('ERROR: Did not succeed sending request to {0}: {1}'.format(url, str(e)))

    # Write result to CSV
    try:
      keys = res[0].keys()
      with open('people.csv', 'w') as output_file:
        # Empty file contents
        output_file.truncate(0)
        dict_writer = csv.DictWriter(output_file, keys)
        dict_writer.writeheader()
        dict_writer.writerows(res)
        output_file.close()
        print('SUCCESS: Successfully wrote to CSV file')
        sys.exit(0)
    except Exception as e:
      print('FATAL: Failed writing to CSV file: {0}'.format(str(e)))
      sys.exit(1)