# scanner.py -- committed by Ivaylo Ivanov
from requests import get
from requests.exceptions import RequestException
from contextlib import closing
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.common.exceptions import NoSuchElementException
import platform
import time
import re
import csv
import sys

class Scanner:
  """Scan a list of web pages for an e-mail address and export the hits to CSV.

  For every URL the page is first probed with a plain HTTP request; if the
  response looks like an HTML page, the page is rendered in headless Chrome
  and the first e-mail address found in the rendered body is recorded.
  """

  # Conventional e-mail shape: local part, "@", dot-separated domain labels and
  # an alphabetic top-level domain. Deliberately strict so that surrounding
  # HTML (quotes, tags, "mailto:" prefixes) is never captured with the address.
  MAIL_PATTERN = re.compile(
    r"[A-Za-z0-9._%+-]+@[A-Za-z0-9-]+(?:\.[A-Za-z0-9-]+)*\.[A-Za-z]{2,}")

  # Column order of the exported CSV file.
  CSV_FIELDS = ("Email", "Name")

  # Seconds to wait for the probing HTTP request before giving up on a URL.
  REQUEST_TIMEOUT = 10

  # Seconds to let the browser run page scripts before reading the body.
  PAGE_LOAD_WAIT = 2

  # Name extraction is not implemented; every contact gets this fixed value.
  PLACEHOLDER_NAME = "Meyer"

  def __init__(self, urls):
    """Store the iterable of page URLs that get_contact() will visit."""
    self.urls = urls

  def response_is_correct(self, resp):
    """Check that the server answered 200 with an HTML content type.

    resp -- a response object exposing `status_code` and a mapping `headers`.

    Returns True only when the status code is 200 and a Content-Type header
    is present and mentions "html" (case-insensitive); False otherwise,
    including when the header is missing altogether.
    """
    content_type = resp.headers.get('Content-Type')
    if content_type is None:
      return False
    return resp.status_code == 200 and 'html' in content_type.lower()

  def get_contact(self, output_path='../people.csv'):
    """Collect the first e-mail address of every page and write them to CSV.

    output_path -- destination of the CSV file (defaults to '../people.csv').

    URLs whose request fails are skipped with a warning, and pages without an
    e-mail address are skipped with an info message. A response that is not a
    200 HTML page, or a failure to write the CSV file, terminates the process
    with exit status 1.
    """
    res = []
    for url in self.urls:
      try:
        # Cheap probe first, so the browser is only started for real HTML pages.
        with closing(get(url, stream=True, timeout=self.REQUEST_TIMEOUT)) as resp:
          page_ok = self.response_is_correct(resp)
      except RequestException as e:
        print('WARNING: Did not succeed sending request to {0}: {1}'.format(url, str(e)))
        continue

      if not page_ok:
        print("ERROR: The response did not pass the criteria for correctness")
        sys.exit(1)

      email = self._extract_email(self._render_page(url))
      if email is None:
        print("INFO: No email address found in {0}".format(url))
        continue
      res.append({"Email": email, "Name": self.PLACEHOLDER_NAME})

    self._write_csv(res, output_path)

  def _render_page(self, url):
    """Load `url` in headless Chrome and return the rendered body HTML."""
    chrome_options = Options()
    chrome_options.add_argument("--headless")
    # Chrome expects "width,height" for this switch.
    chrome_options.add_argument("--window-size=1920,1080")
    driver = webdriver.Chrome(options=chrome_options)
    try:
      driver.get(url)
      time.sleep(self.PAGE_LOAD_WAIT) # Give the page some time to load
      return driver.execute_script("return document.body.innerHTML;")
    finally:
      # Always shut the browser down, otherwise one Chrome process per URL leaks.
      driver.quit()

  @classmethod
  def _extract_email(cls, text):
    """Return the first e-mail address found in `text`, or None if there is none."""
    if not text:
      return None
    match = cls.MAIL_PATTERN.search(text)
    return match.group(0) if match else None

  def _write_csv(self, rows, output_path):
    """Write the contact dictionaries in `rows` to `output_path` with a header row."""
    if not rows:
      print("WARNING: No contacts were found; writing a header-only CSV file")
    try:
      # Text mode with newline='' is what the csv module requires on Python 3.
      with open(output_path, 'w', newline='', encoding='utf-8') as output_file:
        dict_writer = csv.DictWriter(output_file, self.CSV_FIELDS)
        dict_writer.writeheader()
        dict_writer.writerows(rows)
    except (OSError, csv.Error) as e:
      print('ERROR: Failed writing to CSV file: {0}'.format(str(e)))
      sys.exit(1)