Compare commits
2 Commits
67d4f5a19b
...
main
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
47a4850f00 | ||
|
|
f1e76d8d41 |
21
README.md
21
README.md
@@ -1 +1,20 @@
|
|||||||
# Please use this read me file for the information required to be submitted per the assignment on MyCourses.
|
# HTTP JavaScript Scraper
|
||||||
|
|
||||||
|
Run the code with the following:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
# python3 main.py {proto}://{site}
|
||||||
|
python3 main.py https://www.nintendo.com
|
||||||
|
```
|
||||||
|
|
||||||
|
Any site that only serves HTTP/2 content will return a 301 HTTP response code.
|
||||||
|
The 301 is the server's attempt to redirect you to HTTP/2 on the same address and port.
|
||||||
|
|
||||||
|
Any status code other than 200 will be reported back to the user without any additional processing.
|
||||||
|
The output of the program will contain the number of unique external resources.
|
||||||
|
If two files are referenced from `abc.com` they will only count as one external reference.
|
||||||
|
Any reference that shares the same base URL will count once.
|
||||||
|
A full list of references will also be printed, regardless of uniqueness.
|
||||||
|
|
||||||
|
No additional dependencies are required to run the program.
|
||||||
|
|
||||||
|
|||||||
10
main.py
10
main.py
@@ -18,7 +18,7 @@ def generate_request():
|
|||||||
global USER_AGENT
|
global USER_AGENT
|
||||||
global REQUEST
|
global REQUEST
|
||||||
USER_AGENT = "Mozilla/5.0 (X11; Linux x86_64; rv:95.0) Gecko/20100101 Firefox/95.0"
|
USER_AGENT = "Mozilla/5.0 (X11; Linux x86_64; rv:95.0) Gecko/20100101 Firefox/95.0"
|
||||||
REQUEST = "GET / HTTP/1.1\r\nHost: %s\r\nConnection: close\r\nUser-Agent: %s\r\n\r\n" % (HOST_REQUEST, USER_AGENT)
|
REQUEST = "GET / HTTP/1.1\r\nHost: %s\r\nConnection: close\r\nAccept-Encoding: Identity\r\nUser-Agent: %s\r\n\r\n" % (HOST_REQUEST, USER_AGENT)
|
||||||
return
|
return
|
||||||
|
|
||||||
def pull_external_url(text):
|
def pull_external_url(text):
|
||||||
@@ -159,6 +159,12 @@ def conn_type_parse():
|
|||||||
return conn_port
|
return conn_port
|
||||||
|
|
||||||
def check_connection_succ(text):
|
def check_connection_succ(text):
|
||||||
|
"""
|
||||||
|
Exits the program if any HTTP response code other than 200 is received
|
||||||
|
:param text: full HTTP response
|
||||||
|
"""
|
||||||
|
# If the website is using HTTP/2 and has no HTTP/1.1 options
|
||||||
|
# you will receive an error here since it returns a 301 code
|
||||||
if text.split("\n")[0].find("200") == -1:
|
if text.split("\n")[0].find("200") == -1:
|
||||||
print("Error on HTTP request")
|
print("Error on HTTP request")
|
||||||
print("HTTP Return Code: %s" % text.split("\n")[0])
|
print("HTTP Return Code: %s" % text.split("\n")[0])
|
||||||
@@ -181,7 +187,7 @@ def main():
|
|||||||
# Obtain data from response
|
# Obtain data from response
|
||||||
fr = ""
|
fr = ""
|
||||||
while True:
|
while True:
|
||||||
r = w.recv(8192)
|
r = w.recv(32768)
|
||||||
if not r:
|
if not r:
|
||||||
break
|
break
|
||||||
fr += r.decode()
|
fr += r.decode()
|
||||||
|
|||||||
Reference in New Issue
Block a user