From d279e4531a786374d52ee970394a247f9b707f77 Mon Sep 17 00:00:00 2001
From: Ivaylo Ivanov
Date: Tue, 18 Dec 2018 22:55:28 +0100
Subject: [PATCH] Add initial scraping capabilities

---
 .gitignore       |  6 +++-
 README.md        | 43 ++++++++++++++++++++++++
 app.py           | 26 ++++++++++++++-
 lib/__init__.py  |  0
 lib/scanner.py   | 87 +++++++++++++++++++++++++++++++++++-------------
 requirements.txt |  3 +-
 6 files changed, 138 insertions(+), 27 deletions(-)
 create mode 100644 README.md
 create mode 100644 lib/__init__.py

diff --git a/.gitignore b/.gitignore
index dbe9c82..5a037c7 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1 +1,5 @@
-.vscode/
\ No newline at end of file
+.vscode/
+*.csv
+*.txt
+!requirements.txt
+__pycache__/
\ No newline at end of file
diff --git a/README.md b/README.md
new file mode 100644
index 0000000..8a6ee6c
--- /dev/null
+++ b/README.md
@@ -0,0 +1,43 @@
+# Contact Scanner
+## What is this?
+The project is a small Python web scraper built with Selenium and BeautifulSoup.
+
+## What does it do?
+The scraper visits the Impressum (legal notice) page of a given website and scans it for an email address and a name, guided by the keywords defined in a supplied file. After it scrapes the page, it writes the results to a CSV file.
+
+**NOTE:** The scraper does **NOT** guarantee correct email-name pairs. It returns the pairs that it can **build**, so you should always take the results with a grain of salt.
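+
+The results land in `people.csv` in the working directory, one pair per row. A hypothetical output (illustrative values only) might look like this:
+```
+Email,Name
+info@example.com,Erika Mustermann
+```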
Usage:") + print("python3 app.py URL_FILE KEYWORD_FILE") + sys.exit(0) + +# Get filenames +url_filename = sys.argv[1] +keyword_filename = sys.argv[2] + +# Open the url file and get the list of URLs +url_file = open(url_filename, 'r') +urls = url_file.read().split('\n') + +# Replace spaces +for url in urls: + url = url.replace(" ", "") + +# Open the keyword file and get the list of keywords +keyword_file = open(keyword_filename, 'r') +keywords = keyword_file.read().split('\n') + +# Scan the contacts in the URL +contact_scanner = Scanner(urls, keywords) +contact_scanner.get_contacts() diff --git a/lib/__init__.py b/lib/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/lib/scanner.py b/lib/scanner.py index 5d2673b..171889d 100644 --- a/lib/scanner.py +++ b/lib/scanner.py @@ -5,6 +5,7 @@ from selenium import webdriver from selenium.webdriver.chrome.options import Options from selenium.webdriver.common.by import By from selenium.common.exceptions import NoSuchElementException +from bs4 import BeautifulSoup import platform import time import re @@ -12,8 +13,9 @@ import csv import sys class Scanner: - def __init__(self, urls): + def __init__(self, urls, keywords): self.urls = urls + self.keywords = keywords """ Check if the server returned status OK, the content type header is set and it is html @@ -25,50 +27,87 @@ class Scanner: """ Get the the first name and email pair on the page """ - def get_contact(self): + def get_contacts(self): res = [] - mail_pattern = re.compile(r"[^@\s]+@[^\.\s]+\..+[^\s]") + mail_pattern = re.compile(r"^[a-zA-Z0-9.!#$%&'*+/=?^_`{|}~-]+@[a-zA-Z0-9](?:[a-zA-Z0-9-]{0,61}[a-zA-Z0-9])?(?:\.[a-zA-Z0-9](?:[a-zA-Z0-9-]{0,61}[a-zA-Z0-9])?)*$") + # Set properties for selenium + chrome_options = Options() + chrome_options.add_argument('--ignore-certificate-errors') + chrome_options.add_argument("--headless") + chrome_options.add_argument("--window-size=1920x1080") + driver = webdriver.Chrome(chrome_options=chrome_options) + # Loop through all pages for url in self.urls: try: with closing(get(url, stream=True)) as resp: if self.response_is_correct(resp): - # Set properties for selenium - chrome_options = Options() - chrome_options.add_argument("--headless") - chrome_options.add_argument("--window-size=1920x1080") - driver = webdriver.Chrome(chrome_options=chrome_options) + print("################################") + print("INFO: Scraping {0}".format(url)) + + # Open in selenium driver.get(url) - time.sleep(2) # Give the page some time to load + # Click on Impressum + try: + impressum_link = driver.find_element(By.XPATH, '//*[contains(text(), "impressum") or contains(text(), "Impressum") or contains(text(),"IMPRESSUM")]') + driver.execute_script("arguments[0].click()", impressum_link) - # Get the entire page body - page_content = driver.execute_script("return document.body.innerHTML;") + # Get the entire page body + soup = BeautifulSoup(driver.page_source, 'html.parser') + person = {} + mail = "" + name = "" + for keyword in self.keywords: + person_regex = r"" + keyword + r"[ a-zA-Z-.]+[^\/\\#`~\n]" + person_pattern = re.compile(person_regex) + # Find the first thing that matches the mail pattern and the keyword + mail = soup.find(text = mail_pattern) + name = soup.find(text = person_pattern) - for keyword in keywords: - try: - person = {} - person["Email"] = re.findall(mail_pattern, page_content)[0] - person["Name"] = "Meyer" + if mail: + person["Email"] = mail + else: + print("WARNING: Did not find email on {0}".format(url)) + break + + if name: + to_replace 
= '"\t\n' + for char in to_replace: + name = name.replace(char, "") + person["Name"] = name + res.append(person) + print("SUCCESS: Pair found on {0}".format(url)) + break + else: + print("WARNING: Did not find keyword {0} on {1}".format(keyword, url)) + + if mail != None and name == None: + person["Name"] = name + person["Email"] = mail res.append(person) - break - except NoSuchElementException: - print("INFO: No results for keyword {0} in {1}").format(keyword, url) + print("INFO: No keyword matches found for {0}. Appending only Email...".format(url)) + except NoSuchElementException as e: + print('WARNING: Could not find Impressum link on {0}: {1}'.format(url, str(e))) else: - print("ERROR: The response did not pass the criteria for correctness") - sys.exit(1) + print("ERROR: The response from {0} did not pass the criteria for correctness".format(url)) except RequestException as e: - print('WARNING: Did not succeed sending request to {0}: {1}'.format(url, str(e))) + print('ERROR: Did not succeed sending request to {0}: {1}'.format(url, str(e))) # Write result to CSV try: keys = res[0].keys() - with open('../people.csv', 'wb') as output_file: + with open('people.csv', 'w') as output_file: + # Empty file contents + output_file.truncate(0) dict_writer = csv.DictWriter(output_file, keys) dict_writer.writeheader() dict_writer.writerows(res) + output_file.close() + print('SUCCESS: Successfully wrote to CSV file') + sys.exit(0) except Exception as e: - print('ERROR: Failed writing to CSV file: {0}').format(str(e)) + print('FATAL: Failed writing to CSV file: {0}'.format(str(e))) sys.exit(1) diff --git a/requirements.txt b/requirements.txt index 46ac4fd..99c85fe 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,2 +1,3 @@ requests>=2.20.1 -selenium>=3.141.0 \ No newline at end of file +selenium>=3.141.0 +beautifulsoup4>=4.6.3 \ No newline at end of file