Add initial scanner
This commit is contained in:
commit
7fbf7e296b
1
.gitignore
vendored
Normal file
1
.gitignore
vendored
Normal file
@ -0,0 +1 @@
|
|||||||
|
.vscode/
|
3
app.py
Normal file
3
app.py
Normal file
@ -0,0 +1,3 @@
|
|||||||
|
from lib.scanner import Scanner
|
||||||
|
|
||||||
|
# Placeholder entry point: Scanner is imported above but is not used yet.
print("Yet to be implemented")
|
74
lib/scanner.py
Normal file
74
lib/scanner.py
Normal file
@ -0,0 +1,74 @@
|
|||||||
|
from requests import get
|
||||||
|
from requests.exceptions import RequestException
|
||||||
|
from contextlib import closing
|
||||||
|
from selenium import webdriver
|
||||||
|
from selenium.webdriver.chrome.options import Options
|
||||||
|
from selenium.webdriver.common.by import By
|
||||||
|
from selenium.common.exceptions import NoSuchElementException
|
||||||
|
import platform
|
||||||
|
import time
|
||||||
|
import re
|
||||||
|
import csv
|
||||||
|
import sys
|
||||||
|
|
||||||
|
class Scanner:
    """Collect the first e-mail address found on each of a list of web pages.

    Each URL is first probed with a plain HTTP request. Pages that answer
    with an HTML document are then rendered in headless Chrome so that
    script-generated content is included, and the rendered markup is searched
    for an e-mail address. The results are written to a CSV file.
    """

    # Local part, "@", then one or more dot-separated domain labels. The
    # character classes keep a match from running into surrounding markup.
    MAIL_PATTERN = re.compile(
        r"[A-Za-z0-9._%+-]+@[A-Za-z0-9-]+(?:\.[A-Za-z0-9-]+)+"
    )

    # Column order of the CSV output; fixed so that the header can be written
    # even when no contact was found.
    CSV_FIELDS = ("Email", "Name")

    # Seconds to wait after navigation so client-side scripts can run.
    PAGE_LOAD_WAIT_SECONDS = 2

    # Upper bound for the probing HTTP request so a dead host cannot hang
    # the scan indefinitely.
    REQUEST_TIMEOUT_SECONDS = 30

    def __init__(self, urls):
        """Store the URLs to scan.

        Args:
            urls: Iterable of page URLs (strings).
        """
        self.urls = urls

    def response_is_correct(self, resp):
        """Return True if *resp* is a 200 response carrying an HTML body.

        Args:
            resp: Response object exposing ``status_code`` and a mapping
                ``headers``.

        Returns:
            bool: False when the status is not 200, the Content-Type header
            is missing, or it does not mention ``html``.
        """
        # .get() instead of indexing: a missing header must yield False,
        # not a KeyError.
        content_type = resp.headers.get('Content-Type')
        if content_type is None:
            return False
        return resp.status_code == 200 and 'html' in content_type.lower()

    def get_contact(self, output_path='../people.csv'):
        """Scan every URL for its first e-mail address and write a CSV file.

        Args:
            output_path: Destination of the CSV file. Defaults to the
                historical location ``../people.csv``.

        Returns:
            list: One ``{"Email": ..., "Name": ...}`` dict per page on which
            an address was found.

        Exits the process with status 1 when a page does not pass
        ``response_is_correct`` or when the CSV file cannot be written.
        Request failures are reported as warnings and the URL is skipped.
        """
        res = []

        for url in self.urls:
            # Only the probing request can raise RequestException, so only
            # that call is guarded.
            try:
                with closing(get(url, stream=True,
                                 timeout=self.REQUEST_TIMEOUT_SECONDS)) as resp:
                    page_is_html = self.response_is_correct(resp)
            except RequestException as e:
                print('WARNING: Did not succeed sending request to {0}: {1}'.format(url, str(e)))
                continue

            if not page_is_html:
                print("ERROR: The response did not pass the criteria for correctness")
                sys.exit(1)

            email = self._extract_email(self._render_page(url))
            if email is None:
                print("INFO: No email address found in {0}".format(url))
                continue

            # TODO: name extraction is not implemented yet; "Meyer" is the
            # placeholder value used so far.
            res.append({"Email": email, "Name": "Meyer"})

        self._write_csv(res, output_path)
        return res

    def _render_page(self, url):
        """Load *url* in headless Chrome and return the rendered body markup."""
        chrome_options = Options()
        chrome_options.add_argument("--headless")
        chrome_options.add_argument("--window-size=1920x1080")
        # "options" is accepted by Selenium 3.8+ and is the only spelling
        # Selenium 4 still supports.
        driver = webdriver.Chrome(options=chrome_options)
        try:
            driver.get(url)
            time.sleep(self.PAGE_LOAD_WAIT_SECONDS)  # Give the page some time to load
            return driver.execute_script("return document.body.innerHTML;")
        finally:
            # Always shut the browser down; otherwise every scanned URL
            # leaves a Chrome process behind.
            driver.quit()

    @classmethod
    def _extract_email(cls, page_content):
        """Return the first e-mail address in *page_content*, or None."""
        match = cls.MAIL_PATTERN.search(page_content or "")
        return match.group(0) if match else None

    @staticmethod
    def _open_csv_for_writing(path):
        """Open *path* the way the csv module expects on this interpreter."""
        if sys.version_info[0] >= 3:
            # Text mode with newline='' lets the csv module control line
            # endings itself.
            return open(path, 'w', newline='', encoding='utf-8')
        # Python 2's csv module writes bytes.
        return open(path, 'wb')

    def _write_csv(self, rows, output_path):
        """Write *rows* with a header line to *output_path*; exit 1 on failure."""
        try:
            output_file = self._open_csv_for_writing(output_path)
            with output_file:
                dict_writer = csv.DictWriter(output_file, list(self.CSV_FIELDS))
                dict_writer.writeheader()
                dict_writer.writerows(rows)
        except (IOError, OSError, csv.Error) as e:
            print('ERROR: Failed writing to CSV file: {0}'.format(str(e)))
            sys.exit(1)
|
||||||
|
|
2
requirements.txt
Normal file
2
requirements.txt
Normal file
@ -0,0 +1,2 @@
|
|||||||
|
requests>=2.20.1
|
||||||
|
selenium>=3.141.0
|
Loading…
Reference in New Issue
Block a user