Skip to content
GitLab
Projects
Groups
Snippets
Help
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
contact-scan
Project overview
Project overview
Details
Activity
Releases
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Analytics
Analytics
Repository
Value Stream
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Commits
Open sidebar
Ivaylo Ivanov
contact-scan
Commits
7fbf7e29
Commit
7fbf7e29
authored
Dec 18, 2018
by
Ivaylo Ivanov
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
Add initial scanner
parents
Changes
4
Hide whitespace changes
Inline
Side-by-side
Showing
4 changed files
with
82 additions
and
0 deletions
+82
-0
.gitignore
.gitignore
+2
-0
app.py
app.py
+3
-0
lib/scanner.py
lib/scanner.py
+74
-0
requirements.txt
requirements.txt
+3
-0
No files found.
.gitignore
0 → 100644
View file @
7fbf7e29
.vscode/
\ No newline at end of file
app.py
0 → 100644
View file @
7fbf7e29
# Entry-point script for contact-scan. The scanner is imported but not yet
# wired up; running this file only prints a placeholder message.
from lib.scanner import Scanner

print("Yet to be implemented")
lib/scanner.py
0 → 100644
View file @
7fbf7e29
from
requests
import
get
from
requests.exceptions
import
RequestException
from
contextlib
import
closing
from
selenium
import
webdriver
from
selenium.webdriver.chrome.options
import
Options
from
selenium.webdriver.common.by
import
By
from
selenium.common.exceptions
import
NoSuchElementException
import
platform
import
time
import
re
import
csv
import
sys
class Scanner:
    """Collect one contact (e-mail address plus name) per page and save them.

    Each URL is first probed with a plain HTTP request; pages that answer
    correctly are then rendered in headless Chrome so that addresses injected
    by JavaScript are visible as well.
    """

    def __init__(self, urls):
        """Store the iterable of page URLs that ``get_contact`` will visit."""
        self.urls = urls

    def response_is_correct(self, resp):
        """Check if the server returned status OK, the content type header is
        set and it is html.

        ``resp`` only needs a ``status_code`` attribute and a mapping-like
        ``headers`` attribute. Returns ``True`` or ``False``; a missing
        ``Content-Type`` header yields ``False`` instead of raising KeyError.
        """
        content_type = resp.headers.get('Content-Type')
        return (resp.status_code == 200
                and content_type is not None
                and 'html' in content_type.lower())

    def get_contact(self):
        """Get the first name and email pair on every page and write them to
        ``../people.csv`` (relative to the current working directory).

        Exits the process with status 1 when a page fails the response check
        or when the CSV file cannot be produced (including the case where no
        contact was found at all). Request failures for a single URL are
        reported as a warning and that URL is skipped.
        """
        res = []
        # NOTE(review): ``.+`` is greedy, so a match runs to the end of the
        # line and may include trailing markup - confirm this is intended.
        mail_pattern = re.compile(r"[^@\s]+@[^\.\s]+\..+[^\s]")

        # Loop through all pages
        for url in self.urls:
            try:
                with closing(get(url, stream=True)) as resp:
                    if not self.response_is_correct(resp):
                        print("ERROR: The response did not pass the criteria for correctness")
                        sys.exit(1)
                    page_content = self._render_page(url)
            except RequestException as e:
                print('WARNING: Did not succeed sending request to {0}: {1}'.format(url, str(e)))
                continue

            person = self._extract_contact(mail_pattern, page_content)
            if person is None:
                # A page without any address is not an error; move on.
                print("INFO: No e-mail address found in {0}".format(url))
            else:
                res.append(person)

        self._write_csv(res)

    def _render_page(self, url):
        """Return the rendered ``<body>`` HTML of ``url`` using headless Chrome."""
        # Set properties for selenium
        chrome_options = Options()
        chrome_options.add_argument("--headless")
        chrome_options.add_argument("--window-size=1920x1080")
        driver = webdriver.Chrome(options=chrome_options)
        try:
            driver.get(url)
            time.sleep(2)  # Give the page some time to load
            # Get the entire page body
            return driver.execute_script("return document.body.innerHTML;")
        finally:
            # Always shut the browser down, otherwise one Chrome process
            # per scanned URL is left running.
            driver.quit()

    def _extract_contact(self, mail_pattern, page_content):
        """Return a contact dict for the first address in the page, or None."""
        emails = mail_pattern.findall(page_content)
        if not emails:
            return None
        # The name is still a hard-coded placeholder.
        return {"Email": emails[0], "Name": "Meyer"}

    def _write_csv(self, rows):
        """Write the contact dicts to ``../people.csv``; exit(1) on failure."""
        if not rows:
            print('ERROR: Failed writing to CSV file: {0}'.format('no contacts were found'))
            sys.exit(1)

        # Write result to CSV
        try:
            fieldnames = list(rows[0].keys())
            # The csv module wants a text file with newline='' on Python 3
            # and a binary file on Python 2.
            if sys.version_info[0] >= 3:
                output_file = open('../people.csv', 'w', newline='')
            else:
                output_file = open('../people.csv', 'wb')
            with output_file:
                dict_writer = csv.DictWriter(output_file, fieldnames)
                dict_writer.writeheader()
                dict_writer.writerows(rows)
        except (EnvironmentError, csv.Error) as e:
            print('ERROR: Failed writing to CSV file: {0}'.format(str(e)))
            sys.exit(1)
requirements.txt
0 → 100644
View file @
7fbf7e29
requests
>=2.20.1
selenium
>=3.141.0
\ No newline at end of file
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment