119 lines
4.4 KiB
Python
119 lines
4.4 KiB
Python
from requests import get
|
|
from requests.exceptions import RequestException
|
|
from requests.packages import urllib3
|
|
from urllib3.exceptions import InsecureRequestWarning
|
|
from contextlib import closing
|
|
from selenium import webdriver
|
|
from selenium.webdriver.chrome.options import Options
|
|
from selenium.webdriver.common.by import By
|
|
from selenium.common.exceptions import NoSuchElementException
|
|
from bs4 import BeautifulSoup
|
|
import platform
|
|
import time
|
|
import re
|
|
import csv
|
|
import sys
|
|
|
|
class Scanner:
    """Collect one contact (name and e-mail) from the "Impressum" page of each URL.

    Each URL is first probed with a plain HTTP request; pages that answer with
    status 200 and an HTML content type are opened in headless Chrome, the
    first element whose text contains "Impressum" is clicked, and the
    resulting page is searched for an e-mail address and a keyword-introduced
    name.
    """

    # Column order of the generated CSV file (matches the insertion order the
    # result dictionaries are built with).
    CSV_FIELDS = ("Email", "Name")

    def __init__(self, urls, keywords):
        """Store the pages to scan and the keywords that introduce a name.

        Args:
            urls: Iterable of page URLs to visit.
            keywords: Iterable of strings. Each one is used verbatim as the
                start of a regular expression, so regex metacharacters in a
                keyword are interpreted as such.
        """
        self.urls = urls
        self.keywords = keywords

    def response_is_correct(self, resp):
        """Return True if the server answered 200 with an HTML content type.

        Args:
            resp: Response-like object exposing ``status_code`` and a
                mapping-like ``headers`` attribute.

        Returns:
            bool: False when the status is not 200, the ``Content-Type``
            header is missing, or it does not mention ``html``.
        """
        # .get() instead of indexing: a response without a Content-Type
        # header must be rejected, not crash with KeyError.
        content_type = resp.headers.get('Content-Type')
        if content_type is None:
            return False
        return resp.status_code == 200 and 'html' in content_type.lower()

    def _find_person(self, soup, mail_pattern, url):
        """Extract the first e-mail / name pair from a parsed page.

        Args:
            soup: Parsed page offering ``find(text=<compiled pattern>)``.
            mail_pattern: Compiled pattern a text node must match to count as
                an e-mail address.
            url: Page address, used only in the progress messages.

        Returns:
            dict | None: ``{"Email": ..., "Name": ...}`` where ``Name`` is
            None if no keyword matched, or None if the page has no e-mail.
        """
        # The e-mail lookup does not depend on the keyword, so do it once.
        mail = soup.find(text=mail_pattern)
        if not mail:
            print("WARNING: Did not find email on {0}".format(url))
            return None
        # Plain str so the result does not keep the whole parse tree alive.
        mail = str(mail)

        strip_table = str.maketrans("", "", '"\t\n')
        for keyword in self.keywords:
            person_pattern = re.compile(keyword + r"[ a-zA-Z-.]+[^\/\\#`~\n]")
            name = soup.find(text=person_pattern)
            if name:
                print("SUCCESS: Pair found on {0}".format(url))
                return {"Email": mail, "Name": str(name).translate(strip_table)}
            print("WARNING: Did not find keyword {0} on {1}".format(keyword, url))

        print("INFO: No keyword matches found for {0}. Appending only Email...".format(url))
        return {"Email": mail, "Name": None}

    def get_contacts(self):
        """Scrape every URL and write the contacts found to ``people.csv``.

        This method does not return: it always ends the process through
        ``sys.exit`` -- status 0 after the CSV file was written, status 1 if
        no contact was found or writing the file failed.
        """
        res = []
        mail_pattern = re.compile(r"^[a-zA-Z0-9.!#$%&'*+/=?^_`{|}~-]+@[a-zA-Z0-9](?:[a-zA-Z0-9-]{0,61}[a-zA-Z0-9])?(?:\.[a-zA-Z0-9](?:[a-zA-Z0-9-]{0,61}[a-zA-Z0-9])?)*$")

        # Set properties for selenium
        chrome_options = Options()
        chrome_options.add_argument('--ignore-certificate-errors')
        chrome_options.add_argument("--headless")
        chrome_options.add_argument("--window-size=1920x1080")
        # `options=` replaces the deprecated `chrome_options=` keyword, which
        # current Selenium releases no longer accept.
        driver = webdriver.Chrome(options=chrome_options)

        # Disable HTTPS request warnings
        urllib3.disable_warnings(category=InsecureRequestWarning)

        try:
            for url in self.urls:
                # Probe the page with a cheap request before starting the
                # browser on it. NOTE(review): certificate verification is
                # switched off here, mirroring --ignore-certificate-errors.
                try:
                    with closing(get(url, stream=True, verify=False)) as resp:
                        page_ok = self.response_is_correct(resp)
                except RequestException as e:
                    print('ERROR: Did not succeed sending request to {0}: {1}'.format(url, str(e)))
                    continue

                if not page_ok:
                    print("ERROR: The response from {0} did not pass the criteria for correctness".format(url))
                    continue

                print("################################")
                print("INFO: Scraping {0}".format(url))

                # Open in selenium
                driver.get(url)

                # Click on Impressum
                try:
                    impressum_link = driver.find_element(By.XPATH, '//*[contains(text(), "impressum") or contains(text(), "Impressum") or contains(text(),"IMPRESSUM")]')
                except NoSuchElementException as e:
                    print('WARNING: Could not find Impressum link on {0}: {1}'.format(url, str(e)))
                    continue
                # Click through JavaScript so the link does not have to be
                # visible or scrolled into view.
                driver.execute_script("arguments[0].click()", impressum_link)

                # Get the entire page body
                soup = BeautifulSoup(driver.page_source, 'html.parser')
                person = self._find_person(soup, mail_pattern, url)
                if person is not None:
                    res.append(person)
        finally:
            # Always shut the browser down, even if scraping raised.
            driver.quit()

        # Write result to CSV
        if not res:
            # Leave any existing people.csv untouched and report the real
            # reason instead of an opaque IndexError.
            print('FATAL: No contacts were found, nothing to write to CSV file')
            sys.exit(1)

        try:
            # newline='' is required by the csv module to avoid blank rows
            # on platforms that translate line endings.
            with open('people.csv', 'w', newline='') as output_file:
                dict_writer = csv.DictWriter(output_file, self.CSV_FIELDS)
                dict_writer.writeheader()
                dict_writer.writerows(res)
        except Exception as e:
            print('FATAL: Failed writing to CSV file: {0}'.format(str(e)))
            sys.exit(1)

        print('SUCCESS: Successfully wrote to CSV file')
        sys.exit(0)
|
|
|