# contact-scan/lib/scanner.py
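"""Scan a list of URLs for contact details.

For each URL, open the page in headless Chrome, follow the site's
"Impressum" link, extract the first name/email pair found, and write
the collected pairs to people.csv.
"""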

from requests import get
from requests.exceptions import RequestException
from contextlib import closing
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.common.exceptions import NoSuchElementException
from bs4 import BeautifulSoup
import re
import csv
import sys

class Scanner:
    def __init__(self, urls, keywords):
        self.urls = urls
        self.keywords = keywords

    def response_is_correct(self, resp):
        """
        Check that the server returned status 200, that the Content-Type
        header is set, and that it identifies HTML content.
        """
        content_type = resp.headers.get('Content-Type', '').lower()
        return resp.status_code == 200 and 'html' in content_type
"""
Get the the first name and email pair on the page
"""
def get_contacts(self):
res = []
mail_pattern = re.compile(r"^[a-zA-Z0-9.!#$%&'*+/=?^_`{|}~-]+@[a-zA-Z0-9](?:[a-zA-Z0-9-]{0,61}[a-zA-Z0-9])?(?:\.[a-zA-Z0-9](?:[a-zA-Z0-9-]{0,61}[a-zA-Z0-9])?)*$")
# Set properties for selenium
chrome_options = Options()
chrome_options.add_argument('--ignore-certificate-errors')
chrome_options.add_argument("--headless")
chrome_options.add_argument("--window-size=1920x1080")
driver = webdriver.Chrome(chrome_options=chrome_options)
# Loop through all pages
for url in self.urls:
try:
with closing(get(url, stream=True)) as resp:
                    if self.response_is_correct(resp):
                        print("################################")
                        print("INFO: Scraping {0}".format(url))
                        # Open in Selenium
                        driver.get(url)
                        # Click on the Impressum link
                        try:
                            impressum_link = driver.find_element(By.XPATH, '//*[contains(text(), "impressum") or contains(text(), "Impressum") or contains(text(), "IMPRESSUM")]')
                            driver.execute_script("arguments[0].click()", impressum_link)
                            # Parse the entire page body
                            soup = BeautifulSoup(driver.page_source, 'html.parser')
                            person = {}
                            mail = None
                            name = None
                            for keyword in self.keywords:
                                # The keyword followed by a name-like run of characters
                                person_pattern = re.compile(re.escape(keyword) + r"[ a-zA-Z-.]+[^\/\\#`~\n]")
                                # Find the first text nodes matching the mail pattern and the keyword
                                mail = soup.find(string=mail_pattern)
                                name = soup.find(string=person_pattern)
                                if mail:
                                    person["Email"] = mail
                                else:
                                    print("WARNING: Did not find email on {0}".format(url))
                                    break
                                if name:
                                    # Strip quotes, tabs and newlines from the match
                                    for char in '"\t\n':
                                        name = name.replace(char, "")
                                    person["Name"] = name
                                    res.append(person)
                                    print("SUCCESS: Pair found on {0}".format(url))
                                    break
                                else:
                                    print("WARNING: Did not find keyword {0} on {1}".format(keyword, url))
                            if mail and not name:
                                person["Name"] = ""
                                person["Email"] = mail
                                res.append(person)
                                print("INFO: No keyword matches found for {0}. Appending only Email...".format(url))
                        except NoSuchElementException as e:
                            print('WARNING: Could not find Impressum link on {0}: {1}'.format(url, str(e)))
                    else:
                        print("ERROR: The response from {0} did not pass the criteria for correctness".format(url))
            except RequestException as e:
                print('ERROR: Did not succeed sending request to {0}: {1}'.format(url, str(e)))
        driver.quit()
        # Write result to CSV; 'w' mode truncates any existing file
        try:
            with open('people.csv', 'w', newline='') as output_file:
                dict_writer = csv.DictWriter(output_file, fieldnames=["Name", "Email"])
                dict_writer.writeheader()
                dict_writer.writerows(res)
            print('SUCCESS: Successfully wrote to CSV file')
            sys.exit(0)
        except Exception as e:
            print('FATAL: Failed writing to CSV file: {0}'.format(str(e)))
            sys.exit(1)
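

# Minimal usage sketch (not part of the original file): the URL and keyword
# values below are hypothetical placeholders chosen only to show how Scanner
# is driven; get_contacts() writes its results to people.csv on success.
if __name__ == "__main__":
    urls = ["https://example.com"]               # hypothetical target list
    keywords = ["Inhaber", "Geschaeftsfuehrer"]  # hypothetical name keywords
    Scanner(urls, keywords).get_contacts()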