Add initial scraping capabilities

Ivaylo Ivanov 2018-12-18 22:55:28 +01:00
parent 7fbf7e296b
commit d279e4531a
6 changed files with 138 additions and 27 deletions

4
.gitignore vendored

@@ -1 +1,5 @@
 .vscode/
+*.csv
+*.txt
+!requirements.txt
+__pycache__/

43
README.md Normal file

@@ -0,0 +1,43 @@
# Contact Scanner
## What is this?
This project is a small Python web scraper built with Selenium and BeautifulSoup.
## What does it do?
The scraper navigates to the Impressum page of a given website and scans it for an email address and a name, based on the keywords defined in a supplied file. After scraping the page, it writes the results to a CSV file.
**NOTE:** The scraper does **NOT** guarantee 100% correct email-name pairs. It returns the pairs that it can **build**, which means you should always take the results with a grain of salt.
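The results are written to `people.csv` in the working directory. As a purely illustrative example (the values below are hypothetical and the column order may vary depending on which fields were found), a successful run could produce:
```
Email,Name
info@example.com,Max Mustermann
```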
## How to use it?
### Prerequisites
You are going to need the following things installed:
* Chrome
* Python 3
* Pip3
* Selenium Chrome driver
After you have these four installed, go on.
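To sanity-check the prerequisites, the following commands should each print a version number (the exact commands may differ depending on your OS and how Chrome and the ChromeDriver binary were installed):
```
google-chrome --version
python3 --version
pip3 --version
chromedriver --version
```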
### Dependencies
The dependencies are listed in [requirements.txt](requirements.txt). Install them with the following command:
```
pip3 install -r requirements.txt
```
### Usage
The application has the following synopsis:
```
SYNOPSIS
python3 app.py URL_FILE KEYWORD_FILE
```
where ```URL_FILE``` is a file containing the URLs to scan, one URL per line, and ```KEYWORD_FILE``` contains the keywords used to search for names, in the same one-per-line format (trim trailing whitespace for best results). See the example below.
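For example, with a hypothetical ```urls.txt```:
```
https://www.example.com
https://www.example.org
```
and a hypothetical ```keywords.txt``` (any keywords likely to precede a name on an Impressum page):
```
Geschäftsführer
Inhaber
```
the scraper would be invoked as:
```
python3 app.py urls.txt keywords.txt
```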
### Usage constraints
You should **NOT**
1. use this scraper for generating spam lists
2. use this scraper without respecting the target's `robots.txt`
3. use this scraper when you have explicitly agreed with the website not to scrape it
4. use this scraper for anything that does not fall under fair use
## Fair use
The scraper falls under fair use because it is designed to search pages for *facts*, not for *content*.

26
app.py

@@ -1,3 +1,27 @@
+import sys
 from lib.scanner import Scanner
 
-print("Yet to be implemented")
+if len(sys.argv) != 3:
+    print("Incorrect number of arguments supplied. Usage:")
+    print("python3 app.py URL_FILE KEYWORD_FILE")
+    sys.exit(0)
+
+# Get filenames
+url_filename = sys.argv[1]
+keyword_filename = sys.argv[2]
+
+# Open the url file and get the list of URLs
+url_file = open(url_filename, 'r')
+urls = url_file.read().split('\n')
+
+# Replace spaces
+for url in urls:
+    url = url.replace(" ", "")
+
+# Open the keyword file and get the list of keywords
+keyword_file = open(keyword_filename, 'r')
+keywords = keyword_file.read().split('\n')
+
+# Scan the contacts in the URL
+contact_scanner = Scanner(urls, keywords)
+contact_scanner.get_contacts()

0
lib/__init__.py Normal file

lib/scanner.py

@@ -5,6 +5,7 @@ from selenium import webdriver
 from selenium.webdriver.chrome.options import Options
 from selenium.webdriver.common.by import By
 from selenium.common.exceptions import NoSuchElementException
+from bs4 import BeautifulSoup
 import platform
 import time
 import re
@@ -12,8 +13,9 @@ import csv
 import sys
 
 class Scanner:
-    def __init__(self, urls):
+    def __init__(self, urls, keywords):
         self.urls = urls
+        self.keywords = keywords
 
     """
     Check if the server returned status OK, the content type header is set and it is html
@@ -25,50 +27,87 @@
     """
     Get the first name and email pair on the page
     """
-    def get_contact(self):
+    def get_contacts(self):
         res = []
-        mail_pattern = re.compile(r"[^@\s]+@[^\.\s]+\..+[^\s]")
+        mail_pattern = re.compile(r"^[a-zA-Z0-9.!#$%&'*+/=?^_`{|}~-]+@[a-zA-Z0-9](?:[a-zA-Z0-9-]{0,61}[a-zA-Z0-9])?(?:\.[a-zA-Z0-9](?:[a-zA-Z0-9-]{0,61}[a-zA-Z0-9])?)*$")
+
+        # Set properties for selenium
+        chrome_options = Options()
+        chrome_options.add_argument('--ignore-certificate-errors')
+        chrome_options.add_argument("--headless")
+        chrome_options.add_argument("--window-size=1920x1080")
+        driver = webdriver.Chrome(chrome_options=chrome_options)
 
         # Loop through all pages
         for url in self.urls:
             try:
                 with closing(get(url, stream=True)) as resp:
                     if self.response_is_correct(resp):
-                        # Set properties for selenium
-                        chrome_options = Options()
-                        chrome_options.add_argument("--headless")
-                        chrome_options.add_argument("--window-size=1920x1080")
-                        driver = webdriver.Chrome(chrome_options=chrome_options)
+                        print("################################")
+                        print("INFO: Scraping {0}".format(url))
+
+                        # Open in selenium
                         driver.get(url)
-                        time.sleep(2) # Give the page some time to load
 
-                        # Get the entire page body
-                        page_content = driver.execute_script("return document.body.innerHTML;")
-                        for keyword in keywords:
-                            try:
-                                person = {}
-                                person["Email"] = re.findall(mail_pattern, page_content)[0]
-                                person["Name"] = "Meyer"
-                                res.append(person)
-                                break
-                            except NoSuchElementException:
-                                print("INFO: No results for keyword {0} in {1}").format(keyword, url)
+                        # Click on Impressum
+                        try:
+                            impressum_link = driver.find_element(By.XPATH, '//*[contains(text(), "impressum") or contains(text(), "Impressum") or contains(text(),"IMPRESSUM")]')
+                            driver.execute_script("arguments[0].click()", impressum_link)
+
+                            # Get the entire page body
+                            soup = BeautifulSoup(driver.page_source, 'html.parser')
+
+                            person = {}
+                            mail = ""
+                            name = ""
+                            for keyword in self.keywords:
+                                person_regex = r"" + keyword + r"[ a-zA-Z-.]+[^\/\\#`~\n]"
+                                person_pattern = re.compile(person_regex)
+
+                                # Find the first thing that matches the mail pattern and the keyword
+                                mail = soup.find(text=mail_pattern)
+                                name = soup.find(text=person_pattern)
+                                if mail:
+                                    person["Email"] = mail
+                                else:
+                                    print("WARNING: Did not find email on {0}".format(url))
+                                    break
+                                if name:
+                                    to_replace = '"\t\n'
+                                    for char in to_replace:
+                                        name = name.replace(char, "")
+                                    person["Name"] = name
+                                    res.append(person)
+                                    print("SUCCESS: Pair found on {0}".format(url))
+                                    break
+                                else:
+                                    print("WARNING: Did not find keyword {0} on {1}".format(keyword, url))
+                            if mail != None and name == None:
+                                person["Name"] = name
+                                person["Email"] = mail
+                                res.append(person)
+                                print("INFO: No keyword matches found for {0}. Appending only Email...".format(url))
+                        except NoSuchElementException as e:
+                            print('WARNING: Could not find Impressum link on {0}: {1}'.format(url, str(e)))
                     else:
-                        print("ERROR: The response did not pass the criteria for correctness")
-                        sys.exit(1)
+                        print("ERROR: The response from {0} did not pass the criteria for correctness".format(url))
             except RequestException as e:
-                print('WARNING: Did not succeed sending request to {0}: {1}'.format(url, str(e)))
+                print('ERROR: Did not succeed sending request to {0}: {1}'.format(url, str(e)))
 
         # Write result to CSV
         try:
             keys = res[0].keys()
-            with open('../people.csv', 'wb') as output_file:
+            with open('people.csv', 'w') as output_file:
+                # Empty file contents
+                output_file.truncate(0)
                 dict_writer = csv.DictWriter(output_file, keys)
                 dict_writer.writeheader()
                 dict_writer.writerows(res)
+            output_file.close()
+            print('SUCCESS: Successfully wrote to CSV file')
+            sys.exit(0)
         except Exception as e:
-            print('ERROR: Failed writing to CSV file: {0}').format(str(e))
+            print('FATAL: Failed writing to CSV file: {0}'.format(str(e)))
             sys.exit(1)

requirements.txt

@@ -1,2 +1,3 @@
 requests>=2.20.1
 selenium>=3.141.0
+beautifulsoup4>=4.6.3