Add initial scanner
This commit is contained in:
		
							
								
								
									
										1
									
								
								.gitignore
									
									
									
									
										vendored
									
									
										Normal file
									
								
							
							
						
						
									
										1
									
								
								.gitignore
									
									
									
									
										vendored
									
									
										Normal file
									
								
							@@ -0,0 +1 @@
 | 
			
		||||
.vscode/
 | 
			
		||||
							
								
								
									
										3
									
								
								app.py
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										3
									
								
								app.py
									
									
									
									
									
										Normal file
									
								
							@@ -0,0 +1,3 @@
 | 
			
		||||
from lib.scanner import Scanner
 | 
			
		||||
 | 
			
		||||
# Placeholder output: this script does not drive the scanner yet.
print("Yet to be implemented")
 | 
			
		||||
							
								
								
									
										74
									
								
								lib/scanner.py
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										74
									
								
								lib/scanner.py
									
									
									
									
									
										Normal file
									
								
							@@ -0,0 +1,74 @@
 | 
			
		||||
from requests import get
 | 
			
		||||
from requests.exceptions import RequestException
 | 
			
		||||
from contextlib import closing
 | 
			
		||||
from selenium import webdriver
 | 
			
		||||
from selenium.webdriver.chrome.options import Options
 | 
			
		||||
from selenium.webdriver.common.by import By
 | 
			
		||||
from selenium.common.exceptions import NoSuchElementException
 | 
			
		||||
import platform
 | 
			
		||||
import time
 | 
			
		||||
import re
 | 
			
		||||
import csv
 | 
			
		||||
import sys
 | 
			
		||||
 | 
			
		||||
class Scanner:
  """Collect the first e-mail address found on each of a list of pages.

  Every URL is first probed with ``requests``; if the probe answers with an
  HTML ``200`` response the page is rendered in headless Chrome and the
  rendered body is searched for an e-mail address. The collected rows are
  written to a CSV file.
  """

  # Column order of the CSV report.
  CSV_FIELDS = ("Email", "Name")

  # Destination of the CSV report (relative to the current working directory).
  CSV_PATH = '../people.csv'

  # Seconds to wait after navigation so scripts on the page can finish.
  PAGE_LOAD_DELAY = 2

  # local-part "@" one or more dot-terminated labels, then an alphabetic TLD.
  # Deliberately bounded so a match cannot run on into surrounding markup.
  MAIL_PATTERN = re.compile(r"[A-Za-z0-9._%+-]+@(?:[A-Za-z0-9-]+\.)+[A-Za-z]{2,}")

  def __init__(self, urls):
    """Remember the iterable of page URLs to scan."""
    self.urls = urls

  def response_is_correct(self, resp):
    """Check if the server returned status OK and an HTML content type.

    ``resp`` must expose ``status_code`` and a mapping-like ``headers``.
    Returns ``False`` (rather than raising) when the ``Content-Type`` header
    is missing or empty.
    """
    content_type = resp.headers.get('Content-Type')
    if resp.status_code != 200 or not content_type:
      return False
    return 'html' in content_type.lower()

  def get_contact(self):
    """Get the first name and email pair on each page and write them to CSV.

    Pages whose request fails are skipped with a warning, and pages without
    an e-mail address are skipped with an info message. A response that is
    not an HTML ``200`` aborts the run with exit status 1, as does a failure
    to write the CSV file.
    """
    res = []
    # Loop through all pages
    for url in self.urls:
      try:
        # The probe only inspects status and headers, so the connection is
        # released before the (much slower) browser rendering starts.
        with closing(get(url, stream=True)) as resp:
          page_is_html = self.response_is_correct(resp)
      except RequestException as e:
        print('WARNING: Did not succeed sending request to {0}: {1}'.format(url, str(e)))
        continue

      if not page_is_html:
        print("ERROR: The response did not pass the criteria for correctness")
        sys.exit(1)

      person = self._extract_contact(self._render_page(url))
      if person is None:
        print("INFO: No results in {0}".format(url))
      else:
        res.append(person)

    self._write_csv(res)

  def _render_page(self, url):
    """Return the rendered ``document.body.innerHTML`` of ``url`` as a string."""
    # Set properties for selenium
    chrome_options = Options()
    chrome_options.add_argument("--headless")
    chrome_options.add_argument("--window-size=1920x1080")
    driver = webdriver.Chrome(options=chrome_options)
    try:
      driver.get(url)
      time.sleep(self.PAGE_LOAD_DELAY)  # Give the page some time to load
      # A falsy script result is treated as an empty page so the caller can
      # always run a text search on the return value.
      return driver.execute_script("return document.body.innerHTML;") or ""
    finally:
      # Always shut the browser down, otherwise every URL leaks a process.
      driver.quit()

  def _extract_contact(self, page_content):
    """Return a row for the first e-mail address in ``page_content``, or ``None``."""
    match = self.MAIL_PATTERN.search(page_content)
    if match is None:
      return None
    # NOTE: name extraction is not implemented; the placeholder value is kept.
    return {"Email": match.group(0), "Name": "Meyer"}

  def _write_csv(self, rows):
    """Write ``rows`` (dicts keyed by ``CSV_FIELDS``) to ``CSV_PATH``; exit 1 on failure."""
    try:
      # The csv module needs a text-mode file opened with newline=''.
      with open(self.CSV_PATH, 'w', newline='', encoding='utf-8') as output_file:
        dict_writer = csv.DictWriter(output_file, self.CSV_FIELDS)
        dict_writer.writeheader()
        dict_writer.writerows(rows)
    except (OSError, csv.Error) as e:
      print('ERROR: Failed writing to CSV file: {0}'.format(str(e)))
      sys.exit(1)
 | 
			
		||||
 | 
			
		||||
							
								
								
									
										2
									
								
								requirements.txt
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										2
									
								
								requirements.txt
									
									
									
									
									
										Normal file
									
								
							@@ -0,0 +1,2 @@
 | 
			
		||||
requests>=2.20.1
 | 
			
		||||
selenium>=3.141.0
 | 
			
		||||
		Reference in New Issue
	
	Block a user