Add initial scraping capabilities
This commit is contained in:
parent
7fbf7e296b
commit
d279e4531a
6
.gitignore
vendored
6
.gitignore
vendored
@ -1 +1,5 @@
|
|||||||
.vscode/
|
.vscode/
|
||||||
|
*.csv
|
||||||
|
*.txt
|
||||||
|
!requirements.txt
|
||||||
|
__pycache__/
|
43
README.md
Normal file
43
README.md
Normal file
@ -0,0 +1,43 @@
|
|||||||
|
# Contact Scanner
|
||||||
|
## What is this?
|
||||||
|
The project is a small python web scraper with Selenium and BeautifulSoup.
|
||||||
|
|
||||||
|
## What does it do?
|
||||||
|
The scraper goes to the impressum page of a given website and scans it for an email address and a name, following the keywords defined in a supplied file. After it scrapes the page, it writes the results in a csv file.
|
||||||
|
|
||||||
|
**NOTE:** The scraper does **NOT** return 100% correct email-name pairs. It returns the pairs that it can **build**. This means that you should always take the results with a grain of salt.
|
||||||
|
|
||||||
|
## How to use it?
|
||||||
|
### Prerequisites
|
||||||
|
You are going to need the following things installed:
|
||||||
|
* Chrome
|
||||||
|
* Python 3
|
||||||
|
* Pip3
|
||||||
|
* Selenium Chrome driver
|
||||||
|
|
||||||
|
After you have these 4 installed, go on.
|
||||||
|
### Dependencies
|
||||||
|
The dependencies are listed in [requirements.txt](requirements.txt). Install them with the following command:
|
||||||
|
```
|
||||||
|
pip3 install -r requirements.txt
|
||||||
|
```
|
||||||
|
|
||||||
|
### Usage
|
||||||
|
The application has the following synopsis:
|
||||||
|
```
|
||||||
|
SYNOPSIS
|
||||||
|
|
||||||
|
python3 app.py URL_FILE KEYWORD_FILE
|
||||||
|
```
|
||||||
|
|
||||||
|
where ```URL_FILE``` is a file with a list of URLs that should be scanned with each URL on a new line, and ```KEYWORD_FILE``` contains a list of keywords based on which you will search for names. The format of the file is the same (you should trim trailing whitespace for best results).
|
||||||
|
|
||||||
|
### Usage constraints
|
||||||
|
You should **NOT**
|
||||||
|
1. use this scraper for generating spam lists
|
||||||
|
2. use this scraper without acknowledging the `robots.txt` of the target
|
||||||
|
3. use this scraper when you have explicitly agreed with the website not to scrape it
|
||||||
|
4. use this scraper if you're not using it under fair use
|
||||||
|
|
||||||
|
## Fair use
|
||||||
|
The scraper falls under fair use because it is designed to search for *facts* in pages and not for *content*
|
26
app.py
26
app.py
@ -1,3 +1,27 @@
|
|||||||
|
import sys
|
||||||
from lib.scanner import Scanner
|
from lib.scanner import Scanner
|
||||||
|
|
||||||
print("Yet to be implemented")
|
if len(sys.argv) != 3:
|
||||||
|
print("Incorrect number of arguments supplied. Usage:")
|
||||||
|
print("python3 app.py URL_FILE KEYWORD_FILE")
|
||||||
|
sys.exit(0)
|
||||||
|
|
||||||
|
# Get filenames
|
||||||
|
url_filename = sys.argv[1]
|
||||||
|
keyword_filename = sys.argv[2]
|
||||||
|
|
||||||
|
# Open the url file and get the list of URLs
|
||||||
|
url_file = open(url_filename, 'r')
|
||||||
|
urls = url_file.read().split('\n')
|
||||||
|
|
||||||
|
# Replace spaces
|
||||||
|
for url in urls:
|
||||||
|
url = url.replace(" ", "")
|
||||||
|
|
||||||
|
# Open the keyword file and get the list of keywords
|
||||||
|
keyword_file = open(keyword_filename, 'r')
|
||||||
|
keywords = keyword_file.read().split('\n')
|
||||||
|
|
||||||
|
# Scan the contacts in the URL
|
||||||
|
contact_scanner = Scanner(urls, keywords)
|
||||||
|
contact_scanner.get_contacts()
|
||||||
|
0
lib/__init__.py
Normal file
0
lib/__init__.py
Normal file
@ -5,6 +5,7 @@ from selenium import webdriver
|
|||||||
from selenium.webdriver.chrome.options import Options
|
from selenium.webdriver.chrome.options import Options
|
||||||
from selenium.webdriver.common.by import By
|
from selenium.webdriver.common.by import By
|
||||||
from selenium.common.exceptions import NoSuchElementException
|
from selenium.common.exceptions import NoSuchElementException
|
||||||
|
from bs4 import BeautifulSoup
|
||||||
import platform
|
import platform
|
||||||
import time
|
import time
|
||||||
import re
|
import re
|
||||||
@ -12,8 +13,9 @@ import csv
|
|||||||
import sys
|
import sys
|
||||||
|
|
||||||
class Scanner:
|
class Scanner:
|
||||||
def __init__(self, urls):
|
def __init__(self, urls, keywords):
|
||||||
self.urls = urls
|
self.urls = urls
|
||||||
|
self.keywords = keywords
|
||||||
|
|
||||||
"""
|
"""
|
||||||
Check if the server returned status OK, the content type header is set and it is html
|
Check if the server returned status OK, the content type header is set and it is html
|
||||||
@ -25,50 +27,87 @@ class Scanner:
|
|||||||
"""
|
"""
|
||||||
Get the the first name and email pair on the page
|
Get the the first name and email pair on the page
|
||||||
"""
|
"""
|
||||||
def get_contact(self):
|
def get_contacts(self):
|
||||||
res = []
|
res = []
|
||||||
mail_pattern = re.compile(r"[^@\s]+@[^\.\s]+\..+[^\s]")
|
mail_pattern = re.compile(r"^[a-zA-Z0-9.!#$%&'*+/=?^_`{|}~-]+@[a-zA-Z0-9](?:[a-zA-Z0-9-]{0,61}[a-zA-Z0-9])?(?:\.[a-zA-Z0-9](?:[a-zA-Z0-9-]{0,61}[a-zA-Z0-9])?)*$")
|
||||||
|
# Set properties for selenium
|
||||||
|
chrome_options = Options()
|
||||||
|
chrome_options.add_argument('--ignore-certificate-errors')
|
||||||
|
chrome_options.add_argument("--headless")
|
||||||
|
chrome_options.add_argument("--window-size=1920x1080")
|
||||||
|
driver = webdriver.Chrome(chrome_options=chrome_options)
|
||||||
|
|
||||||
# Loop through all pages
|
# Loop through all pages
|
||||||
for url in self.urls:
|
for url in self.urls:
|
||||||
try:
|
try:
|
||||||
with closing(get(url, stream=True)) as resp:
|
with closing(get(url, stream=True)) as resp:
|
||||||
if self.response_is_correct(resp):
|
if self.response_is_correct(resp):
|
||||||
# Set properties for selenium
|
print("################################")
|
||||||
chrome_options = Options()
|
print("INFO: Scraping {0}".format(url))
|
||||||
chrome_options.add_argument("--headless")
|
|
||||||
chrome_options.add_argument("--window-size=1920x1080")
|
# Open in selenium
|
||||||
driver = webdriver.Chrome(chrome_options=chrome_options)
|
|
||||||
driver.get(url)
|
driver.get(url)
|
||||||
|
|
||||||
time.sleep(2) # Give the page some time to load
|
# Click on Impressum
|
||||||
|
try:
|
||||||
|
impressum_link = driver.find_element(By.XPATH, '//*[contains(text(), "impressum") or contains(text(), "Impressum") or contains(text(),"IMPRESSUM")]')
|
||||||
|
driver.execute_script("arguments[0].click()", impressum_link)
|
||||||
|
|
||||||
# Get the entire page body
|
# Get the entire page body
|
||||||
page_content = driver.execute_script("return document.body.innerHTML;")
|
soup = BeautifulSoup(driver.page_source, 'html.parser')
|
||||||
|
person = {}
|
||||||
|
mail = ""
|
||||||
|
name = ""
|
||||||
|
for keyword in self.keywords:
|
||||||
|
person_regex = r"" + keyword + r"[ a-zA-Z-.]+[^\/\\#`~\n]"
|
||||||
|
person_pattern = re.compile(person_regex)
|
||||||
|
# Find the first thing that matches the mail pattern and the keyword
|
||||||
|
mail = soup.find(text = mail_pattern)
|
||||||
|
name = soup.find(text = person_pattern)
|
||||||
|
|
||||||
for keyword in keywords:
|
if mail:
|
||||||
try:
|
person["Email"] = mail
|
||||||
person = {}
|
else:
|
||||||
person["Email"] = re.findall(mail_pattern, page_content)[0]
|
print("WARNING: Did not find email on {0}".format(url))
|
||||||
person["Name"] = "Meyer"
|
break
|
||||||
|
|
||||||
|
if name:
|
||||||
|
to_replace = '"\t\n'
|
||||||
|
for char in to_replace:
|
||||||
|
name = name.replace(char, "")
|
||||||
|
person["Name"] = name
|
||||||
|
res.append(person)
|
||||||
|
print("SUCCESS: Pair found on {0}".format(url))
|
||||||
|
break
|
||||||
|
else:
|
||||||
|
print("WARNING: Did not find keyword {0} on {1}".format(keyword, url))
|
||||||
|
|
||||||
|
if mail != None and name == None:
|
||||||
|
person["Name"] = name
|
||||||
|
person["Email"] = mail
|
||||||
res.append(person)
|
res.append(person)
|
||||||
break
|
print("INFO: No keyword matches found for {0}. Appending only Email...".format(url))
|
||||||
except NoSuchElementException:
|
|
||||||
print("INFO: No results for keyword {0} in {1}").format(keyword, url)
|
|
||||||
|
|
||||||
|
except NoSuchElementException as e:
|
||||||
|
print('WARNING: Could not find Impressum link on {0}: {1}'.format(url, str(e)))
|
||||||
else:
|
else:
|
||||||
print("ERROR: The response did not pass the criteria for correctness")
|
print("ERROR: The response from {0} did not pass the criteria for correctness".format(url))
|
||||||
sys.exit(1)
|
|
||||||
except RequestException as e:
|
except RequestException as e:
|
||||||
print('WARNING: Did not succeed sending request to {0}: {1}'.format(url, str(e)))
|
print('ERROR: Did not succeed sending request to {0}: {1}'.format(url, str(e)))
|
||||||
|
|
||||||
# Write result to CSV
|
# Write result to CSV
|
||||||
try:
|
try:
|
||||||
keys = res[0].keys()
|
keys = res[0].keys()
|
||||||
with open('../people.csv', 'wb') as output_file:
|
with open('people.csv', 'w') as output_file:
|
||||||
|
# Empty file contents
|
||||||
|
output_file.truncate(0)
|
||||||
dict_writer = csv.DictWriter(output_file, keys)
|
dict_writer = csv.DictWriter(output_file, keys)
|
||||||
dict_writer.writeheader()
|
dict_writer.writeheader()
|
||||||
dict_writer.writerows(res)
|
dict_writer.writerows(res)
|
||||||
|
output_file.close()
|
||||||
|
print('SUCCESS: Successfully wrote to CSV file')
|
||||||
|
sys.exit(0)
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
print('ERROR: Failed writing to CSV file: {0}').format(str(e))
|
print('FATAL: Failed writing to CSV file: {0}'.format(str(e)))
|
||||||
sys.exit(1)
|
sys.exit(1)
|
||||||
|
|
||||||
|
@ -1,2 +1,3 @@
|
|||||||
requests>=2.20.1
|
requests>=2.20.1
|
||||||
selenium>=3.141.0
|
selenium>=3.141.0
|
||||||
|
beautifulsoup4>=4.6.3
|
Loading…
Reference in New Issue
Block a user