contact-scan/lib/scanner.py

114 lines
4.2 KiB
Python
Raw Normal View History

2018-12-18 14:18:04 +00:00
from requests import get
from requests.exceptions import RequestException
from contextlib import closing
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.common.exceptions import NoSuchElementException
2018-12-18 21:55:28 +00:00
from bs4 import BeautifulSoup
2018-12-18 14:18:04 +00:00
import platform
import time
import re
import csv
import sys
class Scanner:
    """Collect one name/email pair from the Impressum page of each URL.

    Each URL is first probed with a plain HTTP request; pages that answer
    with status 200 and an HTML content type are opened in headless Chrome,
    the "Impressum" link is clicked, and the resulting page is searched for
    an email address and for a text node introduced by one of the keywords.
    """

    # A text node must match this pattern in full to count as an email address.
    _MAIL_PATTERN = re.compile(r"^[a-zA-Z0-9.!#$%&'*+/=?^_`{|}~-]+@[a-zA-Z0-9](?:[a-zA-Z0-9-]{0,61}[a-zA-Z0-9])?(?:\.[a-zA-Z0-9](?:[a-zA-Z0-9-]{0,61}[a-zA-Z0-9])?)*$")

    # XPath matching any element whose text mentions the Impressum page.
    _IMPRESSUM_XPATH = '//*[contains(text(), "impressum") or contains(text(), "Impressum") or contains(text(),"IMPRESSUM")]'

    # Characters stripped from a matched name before it is stored.
    _NAME_NOISE = '"\t\n'

    def __init__(self, urls, keywords):
        """Store the pages to scan and the keywords that introduce a name.

        Args:
            urls: Iterable of page URLs to scan.
            keywords: Iterable of strings; each one is inserted verbatim at
                the start of a regular expression, so regex metacharacters
                in a keyword are interpreted as such.
        """
        self.urls = urls
        self.keywords = keywords

    def response_is_correct(self, resp):
        """Check that the server returned status OK with an HTML content type.

        Args:
            resp: Response object exposing ``status_code`` and a ``headers``
                mapping.

        Returns:
            True if the status code is 200 and the Content-Type header is
            present and contains "html" (case-insensitive), otherwise False.
        """
        # .get() instead of indexing: a response without the header must be
        # rejected, not crash the scan with a KeyError.
        content_type = resp.headers.get('Content-Type')
        return (
            resp.status_code == 200
            and content_type is not None
            and 'html' in content_type.lower()
        )

    def get_contacts(self):
        """Scan every URL, write the results to ``people.csv`` and exit.

        This method never returns: it terminates the process through
        ``sys.exit`` with status 0 after the CSV file was written, or with
        status 1 when nothing was found or the file could not be written.
        """
        res = []
        driver = self._create_driver()
        try:
            for url in self.urls:
                # Cheap probe first so the browser is only used for HTML pages.
                try:
                    with closing(get(url, stream=True)) as resp:
                        usable = self.response_is_correct(resp)
                except RequestException as e:
                    print('ERROR: Did not succeed sending request to {0}: {1}'.format(url, str(e)))
                    continue

                if not usable:
                    print("ERROR: The response from {0} did not pass the criteria for correctness".format(url))
                    continue

                print("################################")
                print("INFO: Scraping {0}".format(url))
                person = self._scrape_impressum(driver, url)
                if person is not None:
                    res.append(person)
        finally:
            # Always shut the browser down, even if scraping raised.
            driver.quit()

        self._write_csv(res)

    @staticmethod
    def _create_driver():
        """Return a headless Chrome driver that ignores certificate errors."""
        chrome_options = Options()
        chrome_options.add_argument('--ignore-certificate-errors')
        chrome_options.add_argument("--headless")
        chrome_options.add_argument("--window-size=1920x1080")
        # `options=` replaces the deprecated `chrome_options=` keyword.
        return webdriver.Chrome(options=chrome_options)

    def _scrape_impressum(self, driver, url):
        """Open url, follow its Impressum link and return the contact found.

        Returns:
            A dict with "Email" and "Name" keys, or None when the page has no
            Impressum link or no email address.
        """
        driver.get(url)
        try:
            impressum_link = driver.find_element(By.XPATH, self._IMPRESSUM_XPATH)
        except NoSuchElementException as e:
            print('WARNING: Could not find Impressum link on {0}: {1}'.format(url, str(e)))
            return None

        # Click through JavaScript rather than through the element itself.
        # NOTE(review): presumably done so hidden/overlapped links still work - confirm.
        driver.execute_script("arguments[0].click()", impressum_link)
        soup = BeautifulSoup(driver.page_source, 'html.parser')
        return self._find_person(soup, url)

    def _find_person(self, soup, url):
        """Return the first email and keyword-introduced name found in soup.

        Returns:
            None when no email is present; otherwise a dict with "Email" and
            "Name", where "Name" is None if no keyword matched.
        """
        # The email does not depend on the keyword, so look it up only once.
        mail = soup.find(text=self._MAIL_PATTERN)
        if mail is None:
            print("WARNING: Did not find email on {0}".format(url))
            return None
        # The pattern's `$` tolerates one trailing newline; drop it.
        email = str(mail).strip()

        for keyword in self.keywords:
            person_pattern = re.compile(keyword + r"[ a-zA-Z-.]+[^\/\\#`~\n]")
            name = soup.find(text=person_pattern)
            if name is None:
                print("WARNING: Did not find keyword {0} on {1}".format(keyword, url))
                continue
            cleaned = str(name).translate(str.maketrans("", "", self._NAME_NOISE))
            print("SUCCESS: Pair found on {0}".format(url))
            return {"Email": email, "Name": cleaned}

        print("INFO: No keyword matches found for {0}. Appending only Email...".format(url))
        return {"Email": email, "Name": None}

    @staticmethod
    def _write_csv(res):
        """Write the collected contacts to people.csv and exit the process."""
        if not res:
            print('FATAL: Failed writing to CSV file: {0}'.format('no contacts were found'))
            sys.exit(1)

        try:
            # newline='' lets the csv module control line endings itself.
            with open('people.csv', 'w', newline='') as output_file:
                dict_writer = csv.DictWriter(output_file, res[0].keys())
                dict_writer.writeheader()
                dict_writer.writerows(res)
        except (OSError, ValueError, csv.Error) as e:
            print('FATAL: Failed writing to CSV file: {0}'.format(str(e)))
            sys.exit(1)

        print('SUCCESS: Successfully wrote to CSV file')
        sys.exit(0)