Compare commits
2 Commits
67d4f5a19b
...
main
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
47a4850f00 | ||
|
|
f1e76d8d41 |
21
README.md
21
README.md
@@ -1 +1,20 @@
|
||||
# Please use this README file for the information required to be submitted per the assignment on MyCourses.
|
||||
# HTTP JavaScript Scraper
|
||||
|
||||
Run the code with the following:
|
||||
|
||||
```bash
|
||||
# python3 main.py {proto}://{site}
|
||||
python3 main.py https://www.nintendo.com
|
||||
```
|
||||
|
||||
Any site that only serves HTTP/2 content will return a 301 HTTP response code.
|
||||
The server returns the 301 in an attempt to redirect you to HTTP/2 on the same address and port.
|
||||
|
||||
Any status code other than 200 will be reported back to the user without any additional processing.
|
||||
The output of the program will contain the number of unique external resources.
|
||||
If two files are referenced from `abc.com`, they will only count as one external reference.
|
||||
Any reference that shares the same base URL will count once.
|
||||
A full list of references will also be printed out, regardless of uniqueness.
|
||||
|
||||
No additional dependencies are required to run the program.
|
||||
|
||||
|
||||
10
main.py
10
main.py
@@ -18,7 +18,7 @@ def generate_request():
|
||||
global USER_AGENT
|
||||
global REQUEST
|
||||
USER_AGENT = "Mozilla/5.0 (X11; Linux x86_64; rv:95.0) Gecko/20100101 Firefox/95.0"
|
||||
REQUEST = "GET / HTTP/1.1\r\nHost: %s\r\nConnection: close\r\nUser-Agent: %s\r\n\r\n" % (HOST_REQUEST, USER_AGENT)
|
||||
REQUEST = "GET / HTTP/1.1\r\nHost: %s\r\nConnection: close\r\nAccept-Encoding: Identity\r\nUser-Agent: %s\r\n\r\n" % (HOST_REQUEST, USER_AGENT)
|
||||
return
|
||||
|
||||
def pull_external_url(text):
|
||||
@@ -159,6 +159,12 @@ def conn_type_parse():
|
||||
return conn_port
|
||||
|
||||
def check_connection_succ(text):
|
||||
"""
|
||||
Exits the program if any HTTP response code other than 200 is received
|
||||
:param text: full HTTP response
|
||||
"""
|
||||
# If the website is using HTTP/2 and has no HTTP/1.1 options
|
||||
# you will receive an error here since it returns a 301 code
|
||||
if text.split("\n")[0].find("200") == -1:
|
||||
print("Error on HTTP request")
|
||||
print("HTTP Return Code: %s" % text.split("\n")[0])
|
||||
@@ -181,7 +187,7 @@ def main():
|
||||
# Obtain data from response
|
||||
fr = ""
|
||||
while True:
|
||||
r = w.recv(8192)
|
||||
r = w.recv(32768)
|
||||
if not r:
|
||||
break
|
||||
fr += r.decode()
|
||||
|
||||
Reference in New Issue
Block a user