From d279e4531a786374d52ee970394a247f9b707f77 Mon Sep 17 00:00:00 2001
From: Ivaylo Ivanov
Date: Tue, 18 Dec 2018 22:55:28 +0100
Subject: [PATCH] Add initial scraping capabilities

---
 .gitignore       |  6 +++-
 README.md        | 43 ++++++++++++++++++++++++
 app.py           | 26 ++++++++++++++-
 lib/__init__.py  |  0
 lib/scanner.py   | 87 +++++++++++++++++++++++++++++++++++-------------
 requirements.txt |  3 +-
 6 files changed, 138 insertions(+), 27 deletions(-)
 create mode 100644 README.md
 create mode 100644 lib/__init__.py

diff --git a/.gitignore b/.gitignore
index dbe9c82..5a037c7 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1 +1,5 @@
-.vscode/
\ No newline at end of file
+.vscode/
+*.csv
+*.txt
+!requirements.txt
+__pycache__/
\ No newline at end of file
diff --git a/README.md b/README.md
new file mode 100644
index 0000000..8a6ee6c
--- /dev/null
+++ b/README.md
@@ -0,0 +1,43 @@
+# Contact Scanner
+## What is this?
+The project is a small Python web scraper built with Selenium and BeautifulSoup.
+
+## What does it do?
+The scraper visits the Impressum (legal notice) page of a given website and scans it for an email address and a name, guided by the keywords defined in a supplied file. After it scrapes the page, it writes the results to a CSV file.
+
+**NOTE:** The scraper does **NOT** guarantee correct email-name pairs. It returns the pairs that it can **build**, so you should always take the results with a grain of salt.
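+
+The results land in `people.csv` in the working directory, one pair per row. A hypothetical output (illustrative values only) might look like this:
+```
+Email,Name
+info@example.com,Erika Mustermann
+```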
Usage:") + print("python3 app.py URL_FILE KEYWORD_FILE") + sys.exit(0) + +# Get filenames +url_filename = sys.argv[1] +keyword_filename = sys.argv[2] + +# Open the url file and get the list of URLs +url_file = open(url_filename, 'r') +urls = url_file.read().split('\n') + +# Replace spaces +for url in urls: + url = url.replace(" ", "") + +# Open the keyword file and get the list of keywords +keyword_file = open(keyword_filename, 'r') +keywords = keyword_file.read().split('\n') + +# Scan the contacts in the URL +contact_scanner = Scanner(urls, keywords) +contact_scanner.get_contacts() diff --git a/lib/__init__.py b/lib/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/lib/scanner.py b/lib/scanner.py index 5d2673b..171889d 100644 --- a/lib/scanner.py +++ b/lib/scanner.py @@ -5,6 +5,7 @@ from selenium import webdriver from selenium.webdriver.chrome.options import Options from selenium.webdriver.common.by import By from selenium.common.exceptions import NoSuchElementException +from bs4 import BeautifulSoup import platform import time import re @@ -12,8 +13,9 @@ import csv import sys class Scanner: - def __init__(self, urls): + def __init__(self, urls, keywords): self.urls = urls + self.keywords = keywords """ Check if the server returned status OK, the content type header is set and it is html @@ -25,50 +27,87 @@ class Scanner: """ Get the the first name and email pair on the page """ - def get_contact(self): + def get_contacts(self): res = [] - mail_pattern = re.compile(r"[^@\s]+@[^\.\s]+\..+[^\s]") + mail_pattern = re.compile(r"^[a-zA-Z0-9.!#$%&'*+/=?^_`{|}~-]+@[a-zA-Z0-9](?:[a-zA-Z0-9-]{0,61}[a-zA-Z0-9])?(?:\.[a-zA-Z0-9](?:[a-zA-Z0-9-]{0,61}[a-zA-Z0-9])?)*$") + # Set properties for selenium + chrome_options = Options() + chrome_options.add_argument('--ignore-certificate-errors') + chrome_options.add_argument("--headless") + chrome_options.add_argument("--window-size=1920x1080") + driver = webdriver.Chrome(chrome_options=chrome_options) + # Loop through all pages for url in self.urls: try: with closing(get(url, stream=True)) as resp: if self.response_is_correct(resp): - # Set properties for selenium - chrome_options = Options() - chrome_options.add_argument("--headless") - chrome_options.add_argument("--window-size=1920x1080") - driver = webdriver.Chrome(chrome_options=chrome_options) + print("################################") + print("INFO: Scraping {0}".format(url)) + + # Open in selenium driver.get(url) - time.sleep(2) # Give the page some time to load + # Click on Impressum + try: + impressum_link = driver.find_element(By.XPATH, '//*[contains(text(), "impressum") or contains(text(), "Impressum") or contains(text(),"IMPRESSUM")]') + driver.execute_script("arguments[0].click()", impressum_link) - # Get the entire page body - page_content = driver.execute_script("return document.body.innerHTML;") + # Get the entire page body + soup = BeautifulSoup(driver.page_source, 'html.parser') + person = {} + mail = "" + name = "" + for keyword in self.keywords: + person_regex = r"" + keyword + r"[ a-zA-Z-.]+[^\/\\#`~\n]" + person_pattern = re.compile(person_regex) + # Find the first thing that matches the mail pattern and the keyword + mail = soup.find(text = mail_pattern) + name = soup.find(text = person_pattern) - for keyword in keywords: - try: - person = {} - person["Email"] = re.findall(mail_pattern, page_content)[0] - person["Name"] = "Meyer" + if mail: + person["Email"] = mail + else: + print("WARNING: Did not find email on {0}".format(url)) + break + + if name: + to_replace 
= '"\t\n' + for char in to_replace: + name = name.replace(char, "") + person["Name"] = name + res.append(person) + print("SUCCESS: Pair found on {0}".format(url)) + break + else: + print("WARNING: Did not find keyword {0} on {1}".format(keyword, url)) + + if mail != None and name == None: + person["Name"] = name + person["Email"] = mail res.append(person) - break - except NoSuchElementException: - print("INFO: No results for keyword {0} in {1}").format(keyword, url) + print("INFO: No keyword matches found for {0}. Appending only Email...".format(url)) + except NoSuchElementException as e: + print('WARNING: Could not find Impressum link on {0}: {1}'.format(url, str(e))) else: - print("ERROR: The response did not pass the criteria for correctness") - sys.exit(1) + print("ERROR: The response from {0} did not pass the criteria for correctness".format(url)) except RequestException as e: - print('WARNING: Did not succeed sending request to {0}: {1}'.format(url, str(e))) + print('ERROR: Did not succeed sending request to {0}: {1}'.format(url, str(e))) # Write result to CSV try: keys = res[0].keys() - with open('../people.csv', 'wb') as output_file: + with open('people.csv', 'w') as output_file: + # Empty file contents + output_file.truncate(0) dict_writer = csv.DictWriter(output_file, keys) dict_writer.writeheader() dict_writer.writerows(res) + output_file.close() + print('SUCCESS: Successfully wrote to CSV file') + sys.exit(0) except Exception as e: - print('ERROR: Failed writing to CSV file: {0}').format(str(e)) + print('FATAL: Failed writing to CSV file: {0}'.format(str(e))) sys.exit(1) diff --git a/requirements.txt b/requirements.txt index 46ac4fd..99c85fe 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,2 +1,3 @@ requests>=2.20.1 -selenium>=3.141.0 \ No newline at end of file +selenium>=3.141.0 +beautifulsoup4>=4.6.3 \ No newline at end of file