Add initial scanner

This commit is contained in:
Ivaylo Ivanov 2018-12-18 15:18:04 +01:00
commit 7fbf7e296b
4 changed files with 80 additions and 0 deletions

1
.gitignore vendored Normal file
View File

@ -0,0 +1 @@
.vscode/

3
app.py Normal file
View File

@ -0,0 +1,3 @@
# Entry-point script. Scanner is imported here but not used yet.
from lib.scanner import Scanner
# Placeholder output; no scan is started by this script so far.
print("Yet to be implemented")

74
lib/scanner.py Normal file
View File

@ -0,0 +1,74 @@
from requests import get
from requests.exceptions import RequestException
from contextlib import closing
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.common.exceptions import NoSuchElementException
import platform
import time
import re
import csv
import sys
class Scanner:
    """Collect the first e-mail address found on each of a list of pages.

    Every URL is first probed with a plain HTTP request. Pages that answer
    with status 200 and an HTML content type are then loaded in headless
    Chrome, and the rendered body is searched for an e-mail address. The
    collected contacts are written to a CSV file.
    """

    # Column order of the generated CSV file.
    FIELDNAMES = ("Email", "Name")

    # Restricted character classes keep a match from running on into the
    # markup that surrounds an address (the previous pattern used a greedy
    # ``.+`` and swallowed the rest of the line).
    MAIL_PATTERN = re.compile(
        r"[A-Za-z0-9._%+-]+@[A-Za-z0-9-]+(?:\.[A-Za-z0-9-]+)+"
    )

    # Seconds to wait for the probe request before giving up on a URL.
    REQUEST_TIMEOUT = 30

    # Seconds granted to the browser for running the page's scripts.
    PAGE_LOAD_DELAY = 2

    def __init__(self, urls):
        """Store the URLs to scan.

        Args:
            urls: Iterable of page URLs (strings).
        """
        self.urls = urls

    def response_is_correct(self, resp):
        """Check that the server returned status OK and an HTML content type.

        Args:
            resp: Response object exposing ``status_code`` and a mapping
                ``headers``.

        Returns:
            True if the status code is 200 and the ``Content-Type`` header
            is present and mentions ``html``; False otherwise.
        """
        # .get() so that a missing header yields False instead of KeyError.
        content_type = resp.headers.get('Content-Type')
        return (
            resp.status_code == 200
            and content_type is not None
            and 'html' in content_type.lower()
        )

    @staticmethod
    def extract_email(page_content):
        """Return the first e-mail address in ``page_content``.

        Args:
            page_content: Text (typically HTML) to search.

        Returns:
            The first matching address, or None if there is none.
        """
        match = Scanner.MAIL_PATTERN.search(page_content)
        return match.group(0) if match else None

    def _render_page(self, url):
        """Load ``url`` in headless Chrome and return the body's inner HTML."""
        chrome_options = Options()
        chrome_options.add_argument("--headless")
        # Chrome expects the size as "width,height".
        chrome_options.add_argument("--window-size=1920,1080")
        driver = webdriver.Chrome(options=chrome_options)
        try:
            driver.get(url)
            time.sleep(self.PAGE_LOAD_DELAY)  # Give the page some time to load
            return driver.execute_script("return document.body.innerHTML;")
        finally:
            # Always shut the browser down, otherwise every scanned URL
            # leaves a Chrome process behind.
            driver.quit()

    def _write_csv(self, contacts, output_path):
        """Write ``contacts`` to ``output_path``; exit with status 1 on failure."""
        try:
            # Text mode with newline='' is what the csv module requires on
            # Python 3; the writer emits its own line terminators.
            with open(output_path, 'w', newline='') as output_file:
                dict_writer = csv.DictWriter(output_file, self.FIELDNAMES)
                dict_writer.writeheader()
                dict_writer.writerows(contacts)
        except (OSError, csv.Error) as e:
            print('ERROR: Failed writing to CSV file: {0}'.format(str(e)))
            sys.exit(1)

    def get_contact(self, output_path='../people.csv'):
        """Get the first name and email pair on each page and save them.

        URLs whose probe request fails are skipped with a warning. Pages
        without any e-mail address are skipped with an info message. If no
        contact was found at all, no file is written.

        Note: the ``Name`` column is still filled with a fixed placeholder
        value; only the e-mail address is actually extracted from the page.

        Args:
            output_path: Destination of the CSV file.

        Raises:
            SystemExit: If a page answers but is not a 200 HTML response,
                or if the CSV file cannot be written.
        """
        res = []
        # Loop through all pages
        for url in self.urls:
            try:
                # The timeout keeps one unresponsive host from blocking
                # the whole scan.
                with closing(
                    get(url, stream=True, timeout=self.REQUEST_TIMEOUT)
                ) as resp:
                    page_ok = self.response_is_correct(resp)
            except RequestException as e:
                print('WARNING: Did not succeed sending request to {0}: {1}'.format(url, str(e)))
                continue

            if not page_ok:
                print("ERROR: The response did not pass the criteria for correctness")
                sys.exit(1)

            email = self.extract_email(self._render_page(url))
            if email is None:
                print("INFO: No results in {0}".format(url))
                continue
            res.append({"Email": email, "Name": "Meyer"})

        if not res:
            print("WARNING: No contacts found, nothing written to {0}".format(output_path))
            return

        # Write result to CSV
        self._write_csv(res, output_path)

2
requirements.txt Normal file
View File

@ -0,0 +1,2 @@
requests>=2.20.1
selenium>=3.141.0