# scanner.py
from requests import get
from requests.exceptions import RequestException
from contextlib import closing
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.common.exceptions import NoSuchElementException
from bs4 import BeautifulSoup
import platform
import time
import re
import csv
import sys

class Scanner:
  """
    Visit a list of web pages, follow each page's "Impressum" link and collect
    the first email address plus the first text that matches one of the
    configured keywords. The collected pairs are written to a CSV file.
  """

  def __init__(self, urls, keywords):
    """
      urls:     iterable of page URLs to scan
      keywords: iterable of strings; each one is inserted verbatim at the
                start of a regular expression used to locate a person's name
    """
    self.urls = urls
    self.keywords = keywords

  def response_is_correct(self, resp):
    """
      Check if the server returned status OK, the content type header is set and it is html

      Returns False (instead of raising KeyError) when the response carries
      no Content-Type header at all.
    """
    content_type = resp.headers.get('Content-Type')
    return (
      resp.status_code == 200
      and content_type is not None
      and 'html' in content_type.lower()
    )

  def get_contacts(self, output_path='people.csv', timeout=30):
    """
      Get the first name and email pair from the Impressum page of every URL
      and write all pairs to a CSV file with the columns Email and Name.

      output_path: path of the CSV file to (over)write
      timeout:     seconds to wait for the preliminary HTTP request per URL

      This method always terminates the process through sys.exit():
      status 0 after the CSV file was written, status 1 when nothing was
      collected or the file could not be written.
    """
    res = []
    mail_pattern = re.compile(r"^[a-zA-Z0-9.!#$%&'*+/=?^_`{|}~-]+@[a-zA-Z0-9](?:[a-zA-Z0-9-]{0,61}[a-zA-Z0-9])?(?:\.[a-zA-Z0-9](?:[a-zA-Z0-9-]{0,61}[a-zA-Z0-9])?)*$")
    # Compile the keyword patterns once instead of once per page.
    # NOTE(review): keywords are not escaped, so regex metacharacters in a
    # keyword are interpreted as regex syntax - confirm this is intended.
    keyword_patterns = [
      (keyword, re.compile(keyword + r"[ a-zA-Z-.]+[^\/\\#`~\n]"))
      for keyword in self.keywords
    ]

    driver = self._create_driver()
    try:
      # Loop through all pages
      for url in self.urls:
        if not self._is_reachable_html(url, timeout):
          continue

        print("################################")
        print("INFO: Scraping {0}".format(url))

        # Open in selenium
        driver.get(url)

        # Click on Impressum
        try:
          impressum_link = driver.find_element(By.XPATH, '//*[contains(text(), "impressum") or contains(text(), "Impressum") or contains(text(),"IMPRESSUM")]')
        except NoSuchElementException as e:
          print('WARNING: Could not find Impressum link on {0}: {1}'.format(url, str(e)))
          continue
        # Click through JavaScript so that hidden or overlapped links work too.
        # NOTE(review): the page source is read right after the click without
        # waiting for navigation to finish - confirm this is reliable enough.
        driver.execute_script("arguments[0].click()", impressum_link)

        # Get the entire page body
        soup = BeautifulSoup(driver.page_source, 'html.parser')
        person = self._extract_contact(soup, url, mail_pattern, keyword_patterns)
        if person is not None:
          res.append(person)
    finally:
      # Always shut the browser down, otherwise every run leaks a Chrome process.
      driver.quit()

    # Write result to CSV
    if not res:
      print('FATAL: Failed writing to CSV file: no contacts were collected')
      sys.exit(1)
    try:
      self._write_contacts_csv(res, output_path)
    except Exception as e:
      # Top-level boundary: report any failure and exit with an error status.
      print('FATAL: Failed writing to CSV file: {0}'.format(str(e)))
      sys.exit(1)
    print('SUCCESS: Successfully wrote to CSV file')
    sys.exit(0)

  def _create_driver(self):
    """
      Build the headless Chrome driver used for scraping.
    """
    # Set properties for selenium
    chrome_options = Options()
    chrome_options.add_argument('--ignore-certificate-errors')
    chrome_options.add_argument("--headless")
    # Chrome expects the size as "width,height".
    chrome_options.add_argument("--window-size=1920,1080")
    # "options" replaces the deprecated "chrome_options" keyword.
    return webdriver.Chrome(options=chrome_options)

  def _is_reachable_html(self, url, timeout):
    """
      Return True when a plain HTTP request to url succeeds and
      response_is_correct() accepts the response; log the reason otherwise.
    """
    try:
      with closing(get(url, stream=True, timeout=timeout)) as resp:
        if self.response_is_correct(resp):
          return True
    except RequestException as e:
      print('ERROR: Did not succeed sending request to {0}: {1}'.format(url, str(e)))
      return False
    print("ERROR: The response from {0} did not pass the criteria for correctness".format(url))
    return False

  def _extract_contact(self, soup, url, mail_pattern, keyword_patterns):
    """
      Return a dict with the keys Email and Name for the parsed page, or None
      when the page contains no email. Name is None when no keyword matched.
    """
    # The email does not depend on the keyword, so look it up only once.
    mail = soup.find(text=mail_pattern)
    if not mail:
      print("WARNING: Did not find email on {0}".format(url))
      return None

    # Strip surrounding whitespace so no stray newline ends up in the CSV cell.
    person = {"Email": str(mail).strip()}
    unwanted_chars = str.maketrans("", "", '"\t\n')
    for keyword, person_pattern in keyword_patterns:
      # Find the first thing that matches the keyword
      name = soup.find(text=person_pattern)
      if name:
        person["Name"] = str(name).translate(unwanted_chars)
        print("SUCCESS: Pair found on {0}".format(url))
        return person
      print("WARNING: Did not find keyword {0} on {1}".format(keyword, url))

    person["Name"] = None
    print("INFO: No keyword matches found for {0}. Appending only Email...".format(url))
    return person

  def _write_contacts_csv(self, contacts, output_path):
    """
      Write the contact dicts to output_path, replacing any existing file.
    """
    # newline='' is what the csv module requires to avoid blank rows on Windows;
    # mode 'w' already truncates and the with-block already closes the file.
    with open(output_path, 'w', newline='') as output_file:
      dict_writer = csv.DictWriter(output_file, fieldnames=["Email", "Name"])
      dict_writer.writeheader()
      dict_writer.writerows(contacts)