102 lines
2.9 KiB
Python
Executable File
102 lines
2.9 KiB
Python
Executable File
# NOTE: Used for requesting web pages
|
|
import requests
|
|
|
|
# NOTE: Used for parsing web pages
|
|
from bs4 import BeautifulSoup
|
|
|
|
# NOTE: Generic imports
|
|
import base64
|
|
import mimetypes
|
|
import logging
|
|
|
|
# Module logger. The name is prefixed with "__main__." which, in the logging
# dotted-name hierarchy, makes it a child of the "__main__" logger.
# NOTE(review): presumably so configuration applied to the entry script's
# logger also covers this module - confirm against the caller.
logger = logging.getLogger("__main__." + __name__)
|
|
|
|
|
|
def downloadFile(url, text=True, mimeType=None, timeout=30):
    """
    Download resource from url and convert it to text or a data url

    Parameters
    ----------
    url: str
        Address of the resource to fetch

    text
        When true the body is decoded as UTF-8 and returned as a str,
        otherwise it is returned as a base64 encoded ``data:`` url

    mimeType
        Mime type to put in the data url. When None it is guessed from
        url. Ignored when text is true

    timeout
        Seconds handed to requests as connect/read timeout, so a stalled
        server cannot block the caller forever

    Returns
    -------
    str
        The decoded body, or a ``data:<mime>;base64,<payload>`` url

    Raises
    ------
    ValueError
        text is false, no mimeType was supplied and none could be guessed.
        Raised before any network access is made
    requests.HTTPError
        The server answered with an error status
    """
    if not text and mimeType is None:
        # Resolve the mime type up front: a resource that cannot be encoded
        # should fail fast instead of after a wasted download
        mimeType, _ = mimetypes.guess_type(url)
        if mimeType is None:
            raise ValueError(
                "Couldnt guess mime type and none was supplied, cant encode to data url"
            )

    with requests.get(url, stream=True, timeout=timeout) as r:
        r.raise_for_status()
        # Join once at the end, repeated `bytes +=` copies the whole buffer
        # for every chunk
        fbytes = b"".join(r.iter_content(chunk_size=8192))

    if text:
        return fbytes.decode("utf-8")

    b64str = base64.b64encode(fbytes).decode("utf-8")
    return "data:{0};base64,{1}".format(mimeType, b64str)
|
|
|
|
|
|
def get(url: str, params=None, followTags=None, timeout=30):
    """
    http/s get request

    Parameters
    ----------
    url: str

    params
        Requests (library) parameters

    followTags
        None or list of tags to download the src/href from. Supported tags
        are "img", "video", "link" (stylesheets only) and "script", other
        entries are ignored

    timeout
        Seconds handed to requests as connect/read timeout for the page
        request, so a stalled server cannot block the caller forever

    Returns
    -------
    dict
        "response" (body text, or the re-serialized html with the followed
        resources inlined), "code" (status code) and "content-type"

    Raises
    ------
    Whatever downloadFile raises for a followed resource is propagated
    """
    # Local import keeps this block self contained
    from urllib.parse import urljoin

    logger.debug("Hopping to it")
    # TODO: Non blocking requests
    # WARN: Do not run self requests until this is fixed
    response = requests.get(url, params=params, timeout=timeout)
    logger.debug("Content retrieved, parsing")
    r = {
        "response": response.text,
        "code": response.status_code,
        "content-type": response.headers.get("content-type"),
    }
    logger.debug("Done parsing")

    # TODO: Reject followtags if content type is other then html
    if followTags is not None:
        soup = BeautifulSoup(r["response"], "html.parser")
        # Relative links are resolved against the final (post redirect) url
        base = response.url

        for tag in followTags:
            if tag in ("img", "video"):
                # src=True skips elements that have no src attribute at all
                for elem in soup.find_all(tag, src=True):
                    src = elem["src"]
                    # Nothing to fetch for empty or already inlined sources
                    if not src or src.startswith("data:"):
                        continue
                    elem["src"] = downloadFile(urljoin(base, src), text=False)
            elif tag == "link":
                for elem in soup.find_all(tag, href=True):
                    # rel is a multi valued attribute, bs4 hands it back as
                    # a list of tokens rather than a single string
                    rel = elem.get("rel") or []
                    if isinstance(rel, str):
                        rel = rel.split()
                    if "stylesheet" not in [token.lower() for token in rel]:
                        continue
                    href = elem["href"]
                    if not href or href.startswith("data:"):
                        continue
                    styleTag = soup.new_tag("style")
                    styleTag.string = downloadFile(urljoin(base, href))
                    # Swapping in place keeps the cascade order and also
                    # works for documents without a <head>
                    elem.replace_with(styleTag)
            elif tag == "script":
                # Inline scripts carry no src and are left untouched
                for elem in soup.find_all(tag, src=True):
                    src = elem["src"]
                    if not src or src.startswith("data:"):
                        continue
                    elem.string = downloadFile(urljoin(base, src))
                    # A script that still has a src attribute ignores its
                    # inline content, so the attribute has to go entirely
                    del elem["src"]

        # str() serializes the markup, .text would only keep the text nodes
        r["response"] = str(soup)

    logger.debug("Done hopping")
    return r
|
|
|
|
|
|
def post(url: str, params=None, timeout=30):
    """
    http/s post request

    Parameters
    ----------
    url: str

    params
        Payload handed to requests as ``data``, i.e. sent as the request
        body

    timeout
        Seconds handed to requests as connect/read timeout, so a stalled
        server cannot block the caller forever

    Returns
    -------
    dict
        "response" (body text), "code" (status code) and "content-type"
    """
    response = requests.post(url, data=params, timeout=timeout)
    # Same result shape as get() so callers can treat both alike
    return {
        "response": response.text,
        "code": response.status_code,
        "content-type": response.headers.get("content-type"),
    }
|