scanner.py 4.43 KB
Newer Older
Ivaylo Ivanov's avatar
Ivaylo Ivanov committed
1 2
from requests import get
from requests.exceptions import RequestException
Ivaylo Ivanov's avatar
Ivaylo Ivanov committed
3 4
from requests.packages import urllib3
from urllib3.exceptions import InsecureRequestWarning
Ivaylo Ivanov's avatar
Ivaylo Ivanov committed
5 6 7 8 9
from contextlib import closing
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.common.exceptions import NoSuchElementException
10
from bs4 import BeautifulSoup
Ivaylo Ivanov's avatar
Ivaylo Ivanov committed
11 12 13 14 15 16 17
import platform
import time
import re
import csv
import sys

class Scanner:
  """
    Scrape a list of web sites for the first e-mail / contact-name pair found
    on their "Impressum" page and write the results to ``people.csv``.

    urls     -- iterable of page URLs to visit
    keywords -- sequence of regex fragments (e.g. job titles) that are expected
                to directly precede a person's name on the Impressum page
  """

  # Seconds to wait for the plain HTTP pre-check before giving up on a URL.
  REQUEST_TIMEOUT = 30

  # Name of the CSV file produced by get_contacts() and its column order.
  OUTPUT_FILE = 'people.csv'
  CSV_FIELDS = ("Email", "Name")

  # Matches a text node that consists of nothing but an e-mail address.
  _MAIL_PATTERN = re.compile(r"^[a-zA-Z0-9.!#$%&'*+/=?^_`{|}~-]+@[a-zA-Z0-9](?:[a-zA-Z0-9-]{0,61}[a-zA-Z0-9])?(?:\.[a-zA-Z0-9](?:[a-zA-Z0-9-]{0,61}[a-zA-Z0-9])?)*$")

  # Locates any element whose text mentions the Impressum page.
  _IMPRESSUM_XPATH = '//*[contains(text(), "impressum") or contains(text(), "Impressum") or contains(text(),"IMPRESSUM")]'

  # Characters removed from a matched name before it is stored.
  _NAME_STRIP = str.maketrans("", "", '"\t\n')

  def __init__(self, urls, keywords):
    """
      Store the URLs to scan and the keywords used to locate a name.
    """
    self.urls = urls
    self.keywords = keywords

  def response_is_correct(self, resp):
    """
      Check if the server returned status OK, the content type header is set
      and it is html.

      Returns False (instead of raising KeyError) when the response carries
      no Content-Type header at all.
    """
    content_type = resp.headers.get('Content-Type')
    if resp.status_code != 200 or not content_type:
      return False
    return 'html' in content_type.lower()

  @staticmethod
  def _build_driver():
    """
      Create the headless Chrome driver used to render the pages.
    """
    chrome_options = Options()
    chrome_options.add_argument('--ignore-certificate-errors')
    chrome_options.add_argument("--headless")
    chrome_options.add_argument("--window-size=1920x1080")
    # `options=` is the supported keyword; `chrome_options=` was deprecated
    # and later removed from Selenium.
    return webdriver.Chrome(options=chrome_options)

  def _extract_person(self, soup, url):
    """
      Find the first e-mail and the first keyword-prefixed name in `soup`.

      Returns a dict with the keys "Email" and "Name" ("Name" is None when no
      keyword matched), or None when the page yields nothing usable.
    """
    # With no keywords nothing was ever recorded for a page; keep that.
    if not self.keywords:
      return None

    # The e-mail lookup does not depend on the keyword, so do it only once.
    mail = soup.find(text=self._MAIL_PATTERN)
    if not mail:
      print("WARNING: Did not find email on {0}".format(url))
      return None

    # str() detaches the value from the parse tree it was found in.
    person = {"Email": str(mail)}

    for keyword in self.keywords:
      person_pattern = re.compile(keyword + r"[ a-zA-Z-.]+[^\/\\#`~\n]")
      name = soup.find(text=person_pattern)
      if name:
        person["Name"] = str(name).translate(self._NAME_STRIP)
        print("SUCCESS: Pair found on {0}".format(url))
        return person
      print("WARNING: Did not find keyword {0} on {1}".format(keyword, url))

    # None of the keywords matched: keep the e-mail on its own.
    person["Name"] = None
    print("INFO: No keyword matches found for {0}. Appending only Email...".format(url))
    return person

  def _scrape_page(self, driver, url):
    """
      Open `url` in the browser, follow its Impressum link and extract the
      contact from the resulting page. Returns a person dict or None.
    """
    driver.get(url)
    try:
      impressum_link = driver.find_element(By.XPATH, self._IMPRESSUM_XPATH)
      # Click through JavaScript so hidden or overlapped links still work.
      driver.execute_script("arguments[0].click()", impressum_link)
      page_source = driver.page_source
    except NoSuchElementException as e:
      print('WARNING: Could not find Impressum link on {0}: {1}'.format(url, str(e)))
      return None

    soup = BeautifulSoup(page_source, 'html.parser')
    return self._extract_person(soup, url)

  def _write_csv(self, res):
    """
      Write the collected contacts to OUTPUT_FILE and terminate the process:
      exit status 0 on success, 1 when there is nothing to write or the
      write fails.
    """
    if not res:
      print('FATAL: Failed writing to CSV file: no contacts were found')
      sys.exit(1)

    try:
      # newline='' is required by the csv module to avoid blank rows on
      # platforms that translate line endings; mode 'w' already truncates.
      with open(self.OUTPUT_FILE, 'w', newline='') as output_file:
        dict_writer = csv.DictWriter(output_file, self.CSV_FIELDS)
        dict_writer.writeheader()
        dict_writer.writerows(res)
    except Exception as e:
      # Last stop before the process exits, so report whatever went wrong.
      print('FATAL: Failed writing to CSV file: {0}'.format(str(e)))
      sys.exit(1)

    print('SUCCESS: Successfully wrote to CSV file')
    sys.exit(0)

  def get_contacts(self):
    """
      Get the first name and email pair on the Impressum page of every URL
      and write all of them to a CSV file.

      This method does not return: it ends the process through sys.exit()
      (0 when the CSV file was written, 1 otherwise).
    """
    res = []

    # Disable HTTPS request warnings
    urllib3.disable_warnings(category=InsecureRequestWarning)

    driver = self._build_driver()
    try:
      # Loop through all pages
      for url in self.urls:
        try:
          with closing(get(url, stream=True, verify=False, timeout=self.REQUEST_TIMEOUT)) as resp:
            if not self.response_is_correct(resp):
              print("ERROR: The response from {0} did not pass the criteria for correctness".format(url))
              continue

            print("################################")
            print("INFO: Scraping {0}".format(url))

            person = self._scrape_page(driver, url)
            if person is not None:
              res.append(person)
        except RequestException as e:
          print('ERROR: Did not succeed sending request to {0}: {1}'.format(url, str(e)))
    finally:
      # Always shut the browser down, even if scraping blew up half-way.
      driver.quit()

    # Write result to CSV
    self._write_csv(res)